Re: [PATCH v2 2/2] crypto: aegis/generic - fix for big endian systems

2018-10-02 Thread Ondrej Mosnacek
On Mon, Oct 1, 2018 at 10:36 AM Ard Biesheuvel
 wrote:
> Use the correct __le32 annotation and accessors to perform the
> single round of AES encryption performed inside the AEGIS transform.
> Otherwise, tcrypt reports:
>
>   alg: aead: Test 1 failed on encryption for aegis128-generic
>   : 6c 25 25 4a 3c 10 1d 27 2b c1 d4 84 9a ef 7f 6e
>   alg: aead: Test 1 failed on encryption for aegis128l-generic
>   : cd c6 e3 b8 a0 70 9d 8e c2 4f 6f fe 71 42 df 28
>   alg: aead: Test 1 failed on encryption for aegis256-generic
>   : aa ed 07 b1 96 1d e9 e6 f2 ed b5 8e 1c 5f dc 1c
>
> Fixes: f606a88e5823 ("crypto: aegis - Add generic AEGIS AEAD implementations")
> Cc:  # v4.18+
> Signed-off-by: Ard Biesheuvel 

LGTM now, thanks!

Reviewed-by: Ondrej Mosnacek 

> ---
>  crypto/aegis.h | 20 +---
>  1 file changed, 9 insertions(+), 11 deletions(-)
>
> diff --git a/crypto/aegis.h b/crypto/aegis.h
> index f1c6900ddb80..405e025fc906 100644
> --- a/crypto/aegis.h
> +++ b/crypto/aegis.h
> @@ -21,7 +21,7 @@
>
>  union aegis_block {
> __le64 words64[AEGIS_BLOCK_SIZE / sizeof(__le64)];
> -   u32 words32[AEGIS_BLOCK_SIZE / sizeof(u32)];
> +   __le32 words32[AEGIS_BLOCK_SIZE / sizeof(__le32)];
> u8 bytes[AEGIS_BLOCK_SIZE];
>  };
>
> @@ -57,24 +57,22 @@ static void crypto_aegis_aesenc(union aegis_block *dst,
> const union aegis_block *src,
> const union aegis_block *key)
>  {
> -   u32 *d = dst->words32;
> const u8  *s  = src->bytes;
> -   const u32 *k  = key->words32;
> const u32 *t0 = crypto_ft_tab[0];
> const u32 *t1 = crypto_ft_tab[1];
> const u32 *t2 = crypto_ft_tab[2];
> const u32 *t3 = crypto_ft_tab[3];
> u32 d0, d1, d2, d3;
>
> -   d0 = t0[s[ 0]] ^ t1[s[ 5]] ^ t2[s[10]] ^ t3[s[15]] ^ k[0];
> -   d1 = t0[s[ 4]] ^ t1[s[ 9]] ^ t2[s[14]] ^ t3[s[ 3]] ^ k[1];
> -   d2 = t0[s[ 8]] ^ t1[s[13]] ^ t2[s[ 2]] ^ t3[s[ 7]] ^ k[2];
> -   d3 = t0[s[12]] ^ t1[s[ 1]] ^ t2[s[ 6]] ^ t3[s[11]] ^ k[3];
> +   d0 = t0[s[ 0]] ^ t1[s[ 5]] ^ t2[s[10]] ^ t3[s[15]];
> +   d1 = t0[s[ 4]] ^ t1[s[ 9]] ^ t2[s[14]] ^ t3[s[ 3]];
> +   d2 = t0[s[ 8]] ^ t1[s[13]] ^ t2[s[ 2]] ^ t3[s[ 7]];
> +   d3 = t0[s[12]] ^ t1[s[ 1]] ^ t2[s[ 6]] ^ t3[s[11]];
>
> -   d[0] = d0;
> -   d[1] = d1;
> -   d[2] = d2;
> -   d[3] = d3;
> +   dst->words32[0] = cpu_to_le32(d0) ^ key->words32[0];
> +   dst->words32[1] = cpu_to_le32(d1) ^ key->words32[1];
> +   dst->words32[2] = cpu_to_le32(d2) ^ key->words32[2];
> +   dst->words32[3] = cpu_to_le32(d3) ^ key->words32[3];
>  }
>
>  #endif /* _CRYPTO_AEGIS_H */
> --
> 2.17.1
>

-- 
Ondrej Mosnacek 
Associate Software Engineer, Security Technologies
Red Hat, Inc.


Re: [PATCH 2/2] crypto: aegis/generic - fix for big endian systems

2018-10-01 Thread Ondrej Mosnacek
On Mon, Oct 1, 2018 at 10:01 AM Ard Biesheuvel
 wrote:
> On 1 October 2018 at 10:00, Ondrej Mosnacek  wrote:
> > On Sun, Sep 30, 2018 at 1:14 PM Ard Biesheuvel
> >  wrote:
> >> On 30 September 2018 at 10:58, Ard Biesheuvel  
> >> wrote:
> >> > Use the correct __le32 annotation and accessors to perform the
> >> > single round of AES encryption performed inside the AEGIS transform.
> >> > Otherwise, tcrypt reports:
> >> >
> >> >   alg: aead: Test 1 failed on encryption for aegis128-generic
> >> >   : 6c 25 25 4a 3c 10 1d 27 2b c1 d4 84 9a ef 7f 6e
> >> >   alg: aead: Test 1 failed on encryption for aegis128l-generic
> >> >   : cd c6 e3 b8 a0 70 9d 8e c2 4f 6f fe 71 42 df 28
> >> >   alg: aead: Test 1 failed on encryption for aegis256-generic
> >> >   : aa ed 07 b1 96 1d e9 e6 f2 ed b5 8e 1c 5f dc 1c
> >> >
> >> > While at it, let's refer to the first precomputed table only, and
> >> > derive the other ones by rotation. This reduces the D-cache footprint
> >> > by 75%, and shouldn't be too costly or free on load/store architectures
> >> > (and X86 has its own AES-NI based implementation)
> >> >
> >> > Fixes: f606a88e5823 ("crypto: aegis - Add generic AEGIS AEAD implementations")
> >> > Cc:  # v4.18+
> >> > Signed-off-by: Ard Biesheuvel 
> >> > ---
> >> >  crypto/aegis.h | 23 +---
> >> >  1 file changed, 10 insertions(+), 13 deletions(-)
> >> >
> >> > diff --git a/crypto/aegis.h b/crypto/aegis.h
> >> > index f1c6900ddb80..84d3e07a3c33 100644
> >> > --- a/crypto/aegis.h
> >> > +++ b/crypto/aegis.h
> >> > @@ -21,7 +21,7 @@
> >> >
> >> >  union aegis_block {
> >> > __le64 words64[AEGIS_BLOCK_SIZE / sizeof(__le64)];
> >> > -   u32 words32[AEGIS_BLOCK_SIZE / sizeof(u32)];
> >> > +   __le32 words32[AEGIS_BLOCK_SIZE / sizeof(__le32)];
> >> > u8 bytes[AEGIS_BLOCK_SIZE];
> >> >  };
> >> >
> >> > @@ -59,22 +59,19 @@ static void crypto_aegis_aesenc(union aegis_block *dst,
> >> >  {
> >> > u32 *d = dst->words32;
> >> > const u8  *s  = src->bytes;
> >> > -   const u32 *k  = key->words32;
> >> > +   const __le32 *k  = key->words32;
> >> > const u32 *t0 = crypto_ft_tab[0];
> >> > -   const u32 *t1 = crypto_ft_tab[1];
> >> > -   const u32 *t2 = crypto_ft_tab[2];
> >> > -   const u32 *t3 = crypto_ft_tab[3];
> >> > u32 d0, d1, d2, d3;
> >> >
> >> > -   d0 = t0[s[ 0]] ^ t1[s[ 5]] ^ t2[s[10]] ^ t3[s[15]] ^ k[0];
> >> > -   d1 = t0[s[ 4]] ^ t1[s[ 9]] ^ t2[s[14]] ^ t3[s[ 3]] ^ k[1];
> >> > -   d2 = t0[s[ 8]] ^ t1[s[13]] ^ t2[s[ 2]] ^ t3[s[ 7]] ^ k[2];
> >> > -   d3 = t0[s[12]] ^ t1[s[ 1]] ^ t2[s[ 6]] ^ t3[s[11]] ^ k[3];
> >> > +   d0 = t0[s[ 0]] ^ rol32(t0[s[ 5]], 8) ^ rol32(t0[s[10]], 16) ^ rol32(t0[s[15]], 24);
> >> > +   d1 = t0[s[ 4]] ^ rol32(t0[s[ 9]], 8) ^ rol32(t0[s[14]], 16) ^ rol32(t0[s[ 3]], 24);
> >> > +   d2 = t0[s[ 8]] ^ rol32(t0[s[13]], 8) ^ rol32(t0[s[ 2]], 16) ^ rol32(t0[s[ 7]], 24);
> >> > +   d3 = t0[s[12]] ^ rol32(t0[s[ 1]], 8) ^ rol32(t0[s[ 6]], 16) ^ rol32(t0[s[11]], 24);
> >> >
> >> > -   d[0] = d0;
> >> > -   d[1] = d1;
> >> > -   d[2] = d2;
> >> > -   d[3] = d3;
> >> > +   d[0] = cpu_to_le32(d0 ^ le32_to_cpu(k[0]));
> >> > +   d[1] = cpu_to_le32(d1 ^ le32_to_cpu(k[1]));
> >> > +   d[2] = cpu_to_le32(d2 ^ le32_to_cpu(k[2]));
> >> > +   d[3] = cpu_to_le32(d3 ^ le32_to_cpu(k[3]));
> >>
> >>
> >> I suppose this
> >>
> >> > +   d[0] = cpu_to_le32(d0) ^ k[0];
> >> > +   d[1] = cpu_to_le32(d1) ^ k[1];
> >> > +   d[2] = cpu_to_le32(d2) ^ k[2];
> >> > +   d[3] = cpu_to_le32(d3) ^ k[3];
> >>
> >> should work fine as well
> >
> > Yeah, that looks nicer, but I'm not sure if it is completely OK to do
> > bitwise/arithmetic operations directly on the __[lb]e* types...  Maybe
> > yes, but the code I've seen that used them usually seemed to treat
> > them as opaque types.
> >
>
> No, xor is fine with __le/__be types

Ah, OK then.  Good to know :)

-- 
Ondrej Mosnacek 
Associate Software Engineer, Security Technologies
Red Hat, Inc.


Re: [PATCH 2/2] crypto: aegis/generic - fix for big endian systems

2018-10-01 Thread Ondrej Mosnacek
On Sun, Sep 30, 2018 at 1:14 PM Ard Biesheuvel
 wrote:
> On 30 September 2018 at 10:58, Ard Biesheuvel  
> wrote:
> > Use the correct __le32 annotation and accessors to perform the
> > single round of AES encryption performed inside the AEGIS transform.
> > Otherwise, tcrypt reports:
> >
> >   alg: aead: Test 1 failed on encryption for aegis128-generic
> >   : 6c 25 25 4a 3c 10 1d 27 2b c1 d4 84 9a ef 7f 6e
> >   alg: aead: Test 1 failed on encryption for aegis128l-generic
> >   : cd c6 e3 b8 a0 70 9d 8e c2 4f 6f fe 71 42 df 28
> >   alg: aead: Test 1 failed on encryption for aegis256-generic
> >   : aa ed 07 b1 96 1d e9 e6 f2 ed b5 8e 1c 5f dc 1c
> >
> > While at it, let's refer to the first precomputed table only, and
> > derive the other ones by rotation. This reduces the D-cache footprint
> > by 75%, and shouldn't be too costly or free on load/store architectures
> > (and X86 has its own AES-NI based implementation)
> >
> > Fixes: f606a88e5823 ("crypto: aegis - Add generic AEGIS AEAD implementations")
> > Cc:  # v4.18+
> > Signed-off-by: Ard Biesheuvel 
> > ---
> >  crypto/aegis.h | 23 +---
> >  1 file changed, 10 insertions(+), 13 deletions(-)
> >
> > diff --git a/crypto/aegis.h b/crypto/aegis.h
> > index f1c6900ddb80..84d3e07a3c33 100644
> > --- a/crypto/aegis.h
> > +++ b/crypto/aegis.h
> > @@ -21,7 +21,7 @@
> >
> >  union aegis_block {
> > __le64 words64[AEGIS_BLOCK_SIZE / sizeof(__le64)];
> > -   u32 words32[AEGIS_BLOCK_SIZE / sizeof(u32)];
> > +   __le32 words32[AEGIS_BLOCK_SIZE / sizeof(__le32)];
> > u8 bytes[AEGIS_BLOCK_SIZE];
> >  };
> >
> > @@ -59,22 +59,19 @@ static void crypto_aegis_aesenc(union aegis_block *dst,
> >  {
> > u32 *d = dst->words32;
> > const u8  *s  = src->bytes;
> > -   const u32 *k  = key->words32;
> > +   const __le32 *k  = key->words32;
> > const u32 *t0 = crypto_ft_tab[0];
> > -   const u32 *t1 = crypto_ft_tab[1];
> > -   const u32 *t2 = crypto_ft_tab[2];
> > -   const u32 *t3 = crypto_ft_tab[3];
> > u32 d0, d1, d2, d3;
> >
> > -   d0 = t0[s[ 0]] ^ t1[s[ 5]] ^ t2[s[10]] ^ t3[s[15]] ^ k[0];
> > -   d1 = t0[s[ 4]] ^ t1[s[ 9]] ^ t2[s[14]] ^ t3[s[ 3]] ^ k[1];
> > -   d2 = t0[s[ 8]] ^ t1[s[13]] ^ t2[s[ 2]] ^ t3[s[ 7]] ^ k[2];
> > -   d3 = t0[s[12]] ^ t1[s[ 1]] ^ t2[s[ 6]] ^ t3[s[11]] ^ k[3];
> > +   d0 = t0[s[ 0]] ^ rol32(t0[s[ 5]], 8) ^ rol32(t0[s[10]], 16) ^ rol32(t0[s[15]], 24);
> > +   d1 = t0[s[ 4]] ^ rol32(t0[s[ 9]], 8) ^ rol32(t0[s[14]], 16) ^ rol32(t0[s[ 3]], 24);
> > +   d2 = t0[s[ 8]] ^ rol32(t0[s[13]], 8) ^ rol32(t0[s[ 2]], 16) ^ rol32(t0[s[ 7]], 24);
> > +   d3 = t0[s[12]] ^ rol32(t0[s[ 1]], 8) ^ rol32(t0[s[ 6]], 16) ^ rol32(t0[s[11]], 24);
> >
> > -   d[0] = d0;
> > -   d[1] = d1;
> > -   d[2] = d2;
> > -   d[3] = d3;
> > +   d[0] = cpu_to_le32(d0 ^ le32_to_cpu(k[0]));
> > +   d[1] = cpu_to_le32(d1 ^ le32_to_cpu(k[1]));
> > +   d[2] = cpu_to_le32(d2 ^ le32_to_cpu(k[2]));
> > +   d[3] = cpu_to_le32(d3 ^ le32_to_cpu(k[3]));
>
>
> I suppose this
>
> > +   d[0] = cpu_to_le32(d0) ^ k[0];
> > +   d[1] = cpu_to_le32(d1) ^ k[1];
> > +   d[2] = cpu_to_le32(d2) ^ k[2];
> > +   d[3] = cpu_to_le32(d3) ^ k[3];
>
> should work fine as well

Yeah, that looks nicer, but I'm not sure if it is completely OK to do
bitwise/arithmetic operations directly on the __[lb]e* types...  Maybe
yes, but the code I've seen that used them usually seemed to treat
them as opaque types.
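For reference, the reason the direct XOR is safe is that byte swapping
distributes over XOR, so cpu_to_le32(d0) ^ k[0] produces the same bytes as
cpu_to_le32(d0 ^ le32_to_cpu(k[0])).  A minimal stand-alone sketch of that
identity (plain C; swap32() below is a hand-rolled stand-in for the kernel's
byte-swap helpers, not the real cpu_to_le32()):

/* Stand-alone demo (not kernel code): byte swapping distributes over XOR,
 * so swapping before or after the XOR yields the same bytes.  swap32()
 * plays the role of cpu_to_le32()/le32_to_cpu() on a big-endian machine. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t swap32(uint32_t x)
{
	return (x >> 24) | ((x >> 8) & 0x0000ff00u) |
	       ((x << 8) & 0x00ff0000u) | (x << 24);
}

int main(void)
{
	uint32_t d = 0x01234567u;	/* AES round output, CPU byte order */
	uint32_t k = 0x89abcdefu;	/* key word as stored in a __le32   */

	/* cpu_to_le32(d ^ le32_to_cpu(k)) == cpu_to_le32(d) ^ k */
	assert(swap32(d ^ swap32(k)) == (swap32(d) ^ k));
	printf("swab32(a ^ b) == swab32(a) ^ swab32(b)\n");
	return 0;
}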

>
> >  }
> >
> >  #endif /* _CRYPTO_AEGIS_H */
> > --
> > 2.19.0
> >

--
Ondrej Mosnacek 
Associate Software Engineer, Security Technologies
Red Hat, Inc.


Re: [PATCH 2/2] crypto: aegis/generic - fix for big endian systems

2018-10-01 Thread Ondrej Mosnacek
Hi Ard,

On Sun, Sep 30, 2018 at 10:59 AM Ard Biesheuvel
 wrote:
> Use the correct __le32 annotation and accessors to perform the
> single round of AES encryption performed inside the AEGIS transform.
> Otherwise, tcrypt reports:
>
>   alg: aead: Test 1 failed on encryption for aegis128-generic
>   : 6c 25 25 4a 3c 10 1d 27 2b c1 d4 84 9a ef 7f 6e
>   alg: aead: Test 1 failed on encryption for aegis128l-generic
>   : cd c6 e3 b8 a0 70 9d 8e c2 4f 6f fe 71 42 df 28
>   alg: aead: Test 1 failed on encryption for aegis256-generic
>   : aa ed 07 b1 96 1d e9 e6 f2 ed b5 8e 1c 5f dc 1c

Hm...  I think the reason I made a mistake here is that I first had a
version with the AES table hard-coded, with an #ifdef/#else/#endif
providing separate table values for the little-endian and big-endian
variants.  Then I realized the aes_generic module exports the
crypto_ft_table and rewrote the code to use that.  Somewhere along the
way I forgot to check whether the aes_generic table uses the same trick
and to correct the code...

It would be nice to apply the same optimization to aes_generic.c, but
unfortunately the current tables are exported so changing the
convention would break external modules that use them :/

>
> While at it, let's refer to the first precomputed table only, and
> derive the other ones by rotation. This reduces the D-cache footprint
> by 75%, and shouldn't be too costly or free on load/store architectures
> (and X86 has its own AES-NI based implementation)

Could you maybe extract this into a separate patch?  I don't think we
should mix functional and performance fixes together.

>
> Fixes: f606a88e5823 ("crypto: aegis - Add generic AEGIS AEAD implementations")
> Cc:  # v4.18+
> Signed-off-by: Ard Biesheuvel 
> ---
>  crypto/aegis.h | 23 +---
>  1 file changed, 10 insertions(+), 13 deletions(-)
>
> diff --git a/crypto/aegis.h b/crypto/aegis.h
> index f1c6900ddb80..84d3e07a3c33 100644
> --- a/crypto/aegis.h
> +++ b/crypto/aegis.h
> @@ -21,7 +21,7 @@
>
>  union aegis_block {
> __le64 words64[AEGIS_BLOCK_SIZE / sizeof(__le64)];
> -   u32 words32[AEGIS_BLOCK_SIZE / sizeof(u32)];
> +   __le32 words32[AEGIS_BLOCK_SIZE / sizeof(__le32)];
> u8 bytes[AEGIS_BLOCK_SIZE];
>  };
>
> @@ -59,22 +59,19 @@ static void crypto_aegis_aesenc(union aegis_block *dst,
>  {
> u32 *d = dst->words32;
> const u8  *s  = src->bytes;
> -   const u32 *k  = key->words32;
> +   const __le32 *k  = key->words32;
> const u32 *t0 = crypto_ft_tab[0];
> -   const u32 *t1 = crypto_ft_tab[1];
> -   const u32 *t2 = crypto_ft_tab[2];
> -   const u32 *t3 = crypto_ft_tab[3];
> u32 d0, d1, d2, d3;
>
> -   d0 = t0[s[ 0]] ^ t1[s[ 5]] ^ t2[s[10]] ^ t3[s[15]] ^ k[0];
> -   d1 = t0[s[ 4]] ^ t1[s[ 9]] ^ t2[s[14]] ^ t3[s[ 3]] ^ k[1];
> -   d2 = t0[s[ 8]] ^ t1[s[13]] ^ t2[s[ 2]] ^ t3[s[ 7]] ^ k[2];
> -   d3 = t0[s[12]] ^ t1[s[ 1]] ^ t2[s[ 6]] ^ t3[s[11]] ^ k[3];
> +   d0 = t0[s[ 0]] ^ rol32(t0[s[ 5]], 8) ^ rol32(t0[s[10]], 16) ^ rol32(t0[s[15]], 24);
> +   d1 = t0[s[ 4]] ^ rol32(t0[s[ 9]], 8) ^ rol32(t0[s[14]], 16) ^ rol32(t0[s[ 3]], 24);
> +   d2 = t0[s[ 8]] ^ rol32(t0[s[13]], 8) ^ rol32(t0[s[ 2]], 16) ^ rol32(t0[s[ 7]], 24);
> +   d3 = t0[s[12]] ^ rol32(t0[s[ 1]], 8) ^ rol32(t0[s[ 6]], 16) ^ rol32(t0[s[11]], 24);
>
> -   d[0] = d0;
> -   d[1] = d1;
> -   d[2] = d2;
> -   d[3] = d3;
> +   d[0] = cpu_to_le32(d0 ^ le32_to_cpu(k[0]));
> +   d[1] = cpu_to_le32(d1 ^ le32_to_cpu(k[1]));
> +   d[2] = cpu_to_le32(d2 ^ le32_to_cpu(k[2]));
> +   d[3] = cpu_to_le32(d3 ^ le32_to_cpu(k[3]));
>  }
>
>  #endif /* _CRYPTO_AEGIS_H */
> --
> 2.19.0
>

Thanks,

-- 
Ondrej Mosnacek 
Associate Software Engineer, Security Technologies
Red Hat, Inc.


Re: [PATCH 1/2] crypto: morus/generic - fix for big endian systems

2018-10-01 Thread Ondrej Mosnacek
On Sun, Sep 30, 2018 at 10:59 AM Ard Biesheuvel
 wrote:
> Omit the endian swabbing when folding the lengths of the assoc and
> crypt input buffers into the state to finalize the tag. This is not
> necessary given that the memory representation of the state is in
> machine native endianness already.
>
> This fixes an error reported by tcrypt running on a big endian system:
>
>   alg: aead: Test 2 failed on encryption for morus640-generic
>   : a8 30 ef fb e6 26 eb 23 b0 87 dd 98 57 f3 e1 4b
>   0010: 21
>   alg: aead: Test 2 failed on encryption for morus1280-generic
>   : 88 19 1b fb 1c 29 49 0e ee 82 2f cb 97 a6 a5 ee
>   0010: 5f

Yikes, I never really got around to testing MORUS and AEGIS on a BE
machine...  My mistake, sorry :/

>
> Fixes: 396be41f16fd ("crypto: morus - Add generic MORUS AEAD implementations")
> Cc:  # v4.18+
> Signed-off-by: Ard Biesheuvel 

Reviewed-by: Ondrej Mosnacek 

> ---
>  crypto/morus1280.c |  7 ++-
>  crypto/morus640.c  | 16 
>  2 files changed, 6 insertions(+), 17 deletions(-)
>
> diff --git a/crypto/morus1280.c b/crypto/morus1280.c
> index d057cf5ac4a8..3889c188f266 100644
> --- a/crypto/morus1280.c
> +++ b/crypto/morus1280.c
> @@ -385,14 +385,11 @@ static void crypto_morus1280_final(struct morus1280_state *state,
>struct morus1280_block *tag_xor,
>u64 assoclen, u64 cryptlen)
>  {
> -   u64 assocbits = assoclen * 8;
> -   u64 cryptbits = cryptlen * 8;
> -
> struct morus1280_block tmp;
> unsigned int i;
>
> -   tmp.words[0] = cpu_to_le64(assocbits);
> -   tmp.words[1] = cpu_to_le64(cryptbits);
> +   tmp.words[0] = assoclen * 8;
> +   tmp.words[1] = cryptlen * 8;
> tmp.words[2] = 0;
> tmp.words[3] = 0;
>
> diff --git a/crypto/morus640.c b/crypto/morus640.c
> index 1ca76e54281b..da06ec2f6a80 100644
> --- a/crypto/morus640.c
> +++ b/crypto/morus640.c
> @@ -384,21 +384,13 @@ static void crypto_morus640_final(struct morus640_state *state,
>   struct morus640_block *tag_xor,
>   u64 assoclen, u64 cryptlen)
>  {
> -   u64 assocbits = assoclen * 8;
> -   u64 cryptbits = cryptlen * 8;
> -
> -   u32 assocbits_lo = (u32)assocbits;
> -   u32 assocbits_hi = (u32)(assocbits >> 32);
> -   u32 cryptbits_lo = (u32)cryptbits;
> -   u32 cryptbits_hi = (u32)(cryptbits >> 32);
> -
> struct morus640_block tmp;
> unsigned int i;
>
> -   tmp.words[0] = cpu_to_le32(assocbits_lo);
> -   tmp.words[1] = cpu_to_le32(assocbits_hi);
> -   tmp.words[2] = cpu_to_le32(cryptbits_lo);
> -   tmp.words[3] = cpu_to_le32(cryptbits_hi);
> +   tmp.words[0] = lower_32_bits(assoclen * 8);
> +   tmp.words[1] = upper_32_bits(assoclen * 8);
> +   tmp.words[2] = lower_32_bits(cryptlen * 8);
> +   tmp.words[3] = upper_32_bits(cryptlen * 8);
>
> for (i = 0; i < MORUS_BLOCK_WORDS; i++)
> state->s[4].words[i] ^= state->s[0].words[i];
> --
> 2.19.0
>

Thanks,

--
Ondrej Mosnacek 
Associate Software Engineer, Security Technologies
Red Hat, Inc.


Re: [PATCH] crypto: lrw - fix rebase error after out of bounds fix

2018-10-01 Thread Ondrej Mosnacek
On Sun, Sep 30, 2018 at 9:51 PM Ard Biesheuvel
 wrote:
> Due to an unfortunate interaction between commit fbe1a850b3b1
> ("crypto: lrw - Fix out-of bounds access on counter overflow") and
> commit c778f96bf347 ("crypto: lrw - Optimize tweak computation"),
> we ended up with a version of next_index() that always returns 127.

Ouch, thanks for taking the time to investigate and fix it!  You are
spot on that it was a rebase error...  I need to get better at
*always* testing what I post :/
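For reference, the fixed next_index() logic is small enough to model in
user space.  A stand-alone sketch (ffz() is approximated with
__builtin_ctz(~x); the trailing return of 127 for a fully wrapped counter
reflects the intent of the earlier out-of-bounds fix and is an assumption
about code outside the quoted hunk):

/* User-space sketch of the fixed next_index() for a 128-bit counter held
 * as four u32 words.  Not the exact kernel code. */
#include <stdint.h>
#include <stdio.h>

static int ffz32(uint32_t x)
{
	return __builtin_ctz(~x);	/* caller guarantees x != 0xffffffff */
}

static int next_index(uint32_t *counter)
{
	int i, res = 0;

	for (i = 0; i < 4; i++) {
		if (counter[i] + 1 != 0)
			return res + ffz32(counter[i]++);

		counter[i] = 0;
		res += 32;
	}
	return 127;	/* counter wrapped from all-ones back to all-zeros */
}

int main(void)
{
	uint32_t ctr[4] = { 0, 0, 0, 0 };

	/* For counter values 0,1,2,3,4 this prints 0 1 0 2 0 -- the index of
	 * the lowest bit that flips from 0 to 1 on each increment. */
	for (int i = 0; i < 5; i++)
		printf("%d ", next_index(ctr));
	printf("\n");
	return 0;
}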

>
> Fixes: c778f96bf347 ("crypto: lrw - Optimize tweak computation")
> Signed-off-by: Ard Biesheuvel 

Reviewed-by: Ondrej Mosnacek 

> ---
>  crypto/lrw.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/crypto/lrw.c b/crypto/lrw.c
> index 6fcf0d431185..0430ccd08728 100644
> --- a/crypto/lrw.c
> +++ b/crypto/lrw.c
> @@ -122,10 +122,9 @@ static int next_index(u32 *counter)
> int i, res = 0;
>
> for (i = 0; i < 4; i++) {
> -   if (counter[i] + 1 != 0) {
> -   res += ffz(counter[i]++);
> -   break;
> -   }
> +   if (counter[i] + 1 != 0)
> +   return res + ffz(counter[i]++);
> +
> counter[i] = 0;
> res += 32;
> }
> --
> 2.17.1
>

--
Ondrej Mosnacek 
Associate Software Engineer, Security Technologies
Red Hat, Inc.


[PATCH] crypto: x86/aegis,morus - Do not require OSXSAVE for SSE2

2018-09-05 Thread Ondrej Mosnacek
It turns out OSXSAVE needs to be checked only for AVX, not for SSE.
Without this patch the affected modules refuse to load on CPUs with SSE2
but without AVX support.

Fixes: 877ccce7cbe8 ("crypto: x86/aegis,morus - Fix and simplify CPUID checks")
Cc:  # 4.18
Reported-by: Zdenek Kaspar 
Signed-off-by: Ondrej Mosnacek 
---
 arch/x86/crypto/aegis128-aesni-glue.c  | 1 -
 arch/x86/crypto/aegis128l-aesni-glue.c | 1 -
 arch/x86/crypto/aegis256-aesni-glue.c  | 1 -
 arch/x86/crypto/morus1280-sse2-glue.c  | 1 -
 arch/x86/crypto/morus640-sse2-glue.c   | 1 -
 5 files changed, 5 deletions(-)

NOTE: This patch doesn't apply to the current crypto-2.6 tree [1], since
that tree is based on v4.18-rc7, which doesn't contain the commit this
patch is fixing. The v4.18 tag needs to be merged in before this patch
can be applied.

Thanks,
Ondrej

[1] https://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6.git/
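For context, the distinction can be sketched in user space: SSE2 is a plain
CPUID feature bit, while safely using AVX additionally requires
CPUID.OSXSAVE plus an XGETBV check that the OS has enabled XMM+YMM state.
The sketch below assumes GCC/Clang's <cpuid.h> on x86 and mirrors the
intent of the checks, not the kernel's boot_cpu_has()/cpu_has_xfeatures()
implementation:

/* User-space sketch only: why OSXSAVE matters for AVX but not for SSE2. */
#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t xgetbv0(void)
{
	uint32_t eax, edx;

	/* only valid when CPUID.OSXSAVE is set (guarded below) */
	__asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
	return ((uint64_t)edx << 32) | eax;
}

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	int sse2    = !!(edx & bit_SSE2);
	int avx     = !!(ecx & bit_AVX);
	int osxsave = !!(ecx & bit_OSXSAVE);

	/* SSE2 algorithms: the CPU feature bit is enough. */
	printf("sse2 usable: %d\n", sse2);

	/* AVX algorithms: also need the OS to have enabled XMM+YMM state. */
	int ymm_enabled = osxsave && ((xgetbv0() & 0x6) == 0x6);
	printf("avx usable:  %d\n", avx && ymm_enabled);
	return 0;
}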

diff --git a/arch/x86/crypto/aegis128-aesni-glue.c 
b/arch/x86/crypto/aegis128-aesni-glue.c
index acd11b3bf639..2a356b948720 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -379,7 +379,6 @@ static int __init crypto_aegis128_aesni_module_init(void)
 {
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
!boot_cpu_has(X86_FEATURE_AES) ||
-   !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
 
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c 
b/arch/x86/crypto/aegis128l-aesni-glue.c
index 2071c3d1ae07..dbe8bb980da1 100644
--- a/arch/x86/crypto/aegis128l-aesni-glue.c
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -379,7 +379,6 @@ static int __init crypto_aegis128l_aesni_module_init(void)
 {
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
!boot_cpu_has(X86_FEATURE_AES) ||
-   !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
 
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c 
b/arch/x86/crypto/aegis256-aesni-glue.c
index b5f2a8fd5a71..8bebda2de92f 100644
--- a/arch/x86/crypto/aegis256-aesni-glue.c
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -379,7 +379,6 @@ static int __init crypto_aegis256_aesni_module_init(void)
 {
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
!boot_cpu_has(X86_FEATURE_AES) ||
-   !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
 
diff --git a/arch/x86/crypto/morus1280-sse2-glue.c 
b/arch/x86/crypto/morus1280-sse2-glue.c
index 95cf857d2cbb..f40244eaf14d 100644
--- a/arch/x86/crypto/morus1280-sse2-glue.c
+++ b/arch/x86/crypto/morus1280-sse2-glue.c
@@ -40,7 +40,6 @@ MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350);
 static int __init crypto_morus1280_sse2_module_init(void)
 {
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
-   !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
 
diff --git a/arch/x86/crypto/morus640-sse2-glue.c 
b/arch/x86/crypto/morus640-sse2-glue.c
index 615fb7bc9a32..9afaf8f8565a 100644
--- a/arch/x86/crypto/morus640-sse2-glue.c
+++ b/arch/x86/crypto/morus640-sse2-glue.c
@@ -40,7 +40,6 @@ MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400);
 static int __init crypto_morus640_sse2_module_init(void)
 {
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
-   !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
 
-- 
2.17.1



[PATCH] crypto: vmx - Fix sleep-in-atomic bugs

2018-08-21 Thread Ondrej Mosnacek
This patch fixes sleep-in-atomic bugs in AES-CBC and AES-XTS VMX
implementations. The problem is that the blkcipher_* functions should
not be called in atomic context.

The bugs can be reproduced via the AF_ALG interface by trying to
encrypt/decrypt sufficiently large buffers (at least 64 KiB) using the
VMX implementations of 'cbc(aes)' or 'xts(aes)'. Such operations then
trigger BUG in crypto_yield():

[  891.863680] BUG: sleeping function called from invalid context at 
include/crypto/algapi.h:424
[  891.864622] in_atomic(): 1, irqs_disabled(): 0, pid: 12347, name: kcapi-enc
[  891.864739] 1 lock held by kcapi-enc/12347:
[  891.864811]  #0: f5d42c46 (sk_lock-AF_ALG){+.+.}, at: 
skcipher_recvmsg+0x50/0x530
[  891.865076] CPU: 5 PID: 12347 Comm: kcapi-enc Not tainted 
4.19.0-0.rc0.git3.1.fc30.ppc64le #1
[  891.865251] Call Trace:
[  891.865340] [c003387578c0] [c0d67ea4] dump_stack+0xe8/0x164 
(unreliable)
[  891.865511] [c00338757910] [c0172a58] ___might_sleep+0x2f8/0x310
[  891.865679] [c00338757990] [c06bff74] 
blkcipher_walk_done+0x374/0x4a0
[  891.865825] [c003387579e0] [d7e73e70] 
p8_aes_cbc_encrypt+0x1c8/0x260 [vmx_crypto]
[  891.865993] [c00338757ad0] [c06c0ee0] 
skcipher_encrypt_blkcipher+0x60/0x80
[  891.866128] [c00338757b10] [c06ec504] 
skcipher_recvmsg+0x424/0x530
[  891.866283] [c00338757bd0] [c0b00654] sock_recvmsg+0x74/0xa0
[  891.866403] [c00338757c10] [c0b00f64] ___sys_recvmsg+0xf4/0x2f0
[  891.866515] [c00338757d90] [c0b02bb8] __sys_recvmsg+0x68/0xe0
[  891.866631] [c00338757e30] [c000bbe4] system_call+0x5c/0x70

Fixes: 8c755ace357c ("crypto: vmx - Adding CBC routines for VMX module")
Fixes: c07f5d3da643 ("crypto: vmx - Adding support for XTS")
Cc: sta...@vger.kernel.org
Signed-off-by: Ondrej Mosnacek 
---
This patch should fix the issue, but I didn't test it. (I'll see if I
can find some time tomorrow to try and recompile the kernel on a PPC
machine... in the meantime please review :)

 drivers/crypto/vmx/aes_cbc.c | 30 ++
 drivers/crypto/vmx/aes_xts.c | 19 ---
 2 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
index 5285ece4f33a..b71895871be3 100644
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
@@ -107,24 +107,23 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
ret = crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
} else {
-   preempt_disable();
-   pagefault_disable();
-   enable_kernel_vsx();
-
blkcipher_walk_init(, dst, src, nbytes);
ret = blkcipher_walk_virt(desc, );
while ((nbytes = walk.nbytes)) {
+   preempt_disable();
+   pagefault_disable();
+   enable_kernel_vsx();
aes_p8_cbc_encrypt(walk.src.virt.addr,
   walk.dst.virt.addr,
   nbytes & AES_BLOCK_MASK,
   >enc_key, walk.iv, 1);
+   disable_kernel_vsx();
+   pagefault_enable();
+   preempt_enable();
+
nbytes &= AES_BLOCK_SIZE - 1;
ret = blkcipher_walk_done(desc, , nbytes);
}
-
-   disable_kernel_vsx();
-   pagefault_enable();
-   preempt_enable();
}
 
return ret;
@@ -147,24 +146,23 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
ret = crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
} else {
-   preempt_disable();
-   pagefault_disable();
-   enable_kernel_vsx();
-
blkcipher_walk_init(, dst, src, nbytes);
ret = blkcipher_walk_virt(desc, );
while ((nbytes = walk.nbytes)) {
+   preempt_disable();
+   pagefault_disable();
+   enable_kernel_vsx();
aes_p8_cbc_encrypt(walk.src.virt.addr,
   walk.dst.virt.addr,
   nbytes & AES_BLOCK_MASK,
   >dec_key, walk.iv, 0);
+   disable_kernel_vsx();
+   pagefault_enable();
+   preempt_enable();
+
nbytes &= AES_BLOCK_SIZE - 1;
ret = blkcipher_walk_done(desc, , nbytes);
}
-
-   disable_kernel_vsx();
-   pagefault_enable();
-   preempt_enable();
}
 
r

[PATCH v2] crypto: x86/aegis,morus - Fix and simplify CPUID checks

2018-08-03 Thread Ondrej Mosnacek
It turns out I had misunderstood how the x86_match_cpu() function works.
It evaluates a logical OR of the matching conditions, not logical AND.
This caused the CPU feature checks for AEGIS to pass even if only SSE2
(but not AES-NI) was supported (or vice versa), leading to potential
crashes if something tried to use the registered algs.

This patch switches the checks to a simpler method that is used e.g. in
the Camellia x86 code.

The patch also removes the MODULE_DEVICE_TABLE declarations which
actually seem to cause the modules to be auto-loaded at boot, which is
not desired. The crypto API on-demand module loading is sufficient.
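A toy user-space model of the x86_match_cpu() behaviour described above
(the real struct x86_cpu_id and matching code live in the kernel; the names
and structure here are simplified stand-ins): a two-entry feature table
matches when either feature is present, whereas the boot_cpu_has()-style
chain requires both.

/* Toy model only -- not the kernel API.  It shows why a two-entry match
 * table behaves like (AES || XMM2) rather than (AES && XMM2). */
#include <stdbool.h>
#include <stdio.h>

enum { FEAT_AES = 1 << 0, FEAT_XMM2 = 1 << 1 };

struct cpu_id { int feature; };	/* one required feature per entry */

/* models x86_match_cpu(): any matching entry is enough */
static bool match_any(const struct cpu_id *table, int cpu_features)
{
	for (; table->feature; table++)
		if (cpu_features & table->feature)
			return true;
	return false;
}

int main(void)
{
	const struct cpu_id table[] = {
		{ FEAT_AES }, { FEAT_XMM2 }, { 0 }
	};
	int cpu = FEAT_XMM2;	/* SSE2-only CPU, no AES-NI */

	printf("table match (OR semantics): %d\n", match_any(table, cpu));
	printf("boot_cpu_has-style (AND):   %d\n",
	       (cpu & FEAT_AES) && (cpu & FEAT_XMM2));
	return 0;
}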

Fixes: 1d373d4e8e15 ("crypto: x86 - Add optimized AEGIS implementations")
Fixes: 6ecc9d9ff91f ("crypto: x86 - Add optimized MORUS implementations")
Signed-off-by: Ondrej Mosnacek 
---
 arch/x86/crypto/aegis128-aesni-glue.c  | 12 
 arch/x86/crypto/aegis128l-aesni-glue.c | 12 
 arch/x86/crypto/aegis256-aesni-glue.c  | 12 
 arch/x86/crypto/morus1280-avx2-glue.c  | 10 +++---
 arch/x86/crypto/morus1280-sse2-glue.c  | 10 +++---
 arch/x86/crypto/morus640-sse2-glue.c   | 10 +++---
 6 files changed, 21 insertions(+), 45 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-glue.c 
b/arch/x86/crypto/aegis128-aesni-glue.c
index 5de7c0d46edf..acd11b3bf639 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128_aesni_alg[] = {
}
 };
 
-static const struct x86_cpu_id aesni_cpu_id[] = {
-   X86_FEATURE_MATCH(X86_FEATURE_AES),
-   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
-   {}
-};
-MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
-
 static int __init crypto_aegis128_aesni_module_init(void)
 {
-   if (!x86_match_cpu(aesni_cpu_id))
+   if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+   !boot_cpu_has(X86_FEATURE_AES) ||
+   !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+   !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
 
return crypto_register_aeads(crypto_aegis128_aesni_alg,
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c 
b/arch/x86/crypto/aegis128l-aesni-glue.c
index 876e4866e633..2071c3d1ae07 100644
--- a/arch/x86/crypto/aegis128l-aesni-glue.c
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128l_aesni_alg[] = {
}
 };
 
-static const struct x86_cpu_id aesni_cpu_id[] = {
-   X86_FEATURE_MATCH(X86_FEATURE_AES),
-   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
-   {}
-};
-MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
-
 static int __init crypto_aegis128l_aesni_module_init(void)
 {
-   if (!x86_match_cpu(aesni_cpu_id))
+   if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+   !boot_cpu_has(X86_FEATURE_AES) ||
+   !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+   !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
 
return crypto_register_aeads(crypto_aegis128l_aesni_alg,
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c 
b/arch/x86/crypto/aegis256-aesni-glue.c
index 2b5dd3af8f4d..b5f2a8fd5a71 100644
--- a/arch/x86/crypto/aegis256-aesni-glue.c
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis256_aesni_alg[] = {
}
 };
 
-static const struct x86_cpu_id aesni_cpu_id[] = {
-   X86_FEATURE_MATCH(X86_FEATURE_AES),
-   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
-   {}
-};
-MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
-
 static int __init crypto_aegis256_aesni_module_init(void)
 {
-   if (!x86_match_cpu(aesni_cpu_id))
+   if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+   !boot_cpu_has(X86_FEATURE_AES) ||
+   !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+   !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
 
return crypto_register_aeads(crypto_aegis256_aesni_alg,
diff --git a/arch/x86/crypto/morus1280-avx2-glue.c 
b/arch/x86/crypto/morus1280-avx2-glue.c
index f111f36d26dc..6634907d6ccd 100644
--- a/arch/x86/crypto/morus1280-avx2-glue.c
+++ b/arch/x86/crypto/morus1280-avx2-glue.c
@@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_avx2_final(void *state, 
void *tag_xor,
 
 MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400);
 
-static const struct x86_cpu_id avx2_cpu_id[] = {
-X86_FEATURE_MATCH(X86_FEATURE_AVX2),
-{}
-};
-MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id);
-
 static int __init crypto_morus1280_avx2_module_init(void)
 {
-   if (!x86_match_cpu(avx2_cpu_id))
+   if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+   !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+   !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
return -ENODEV;
 
return crypto_register_aeads(crypto_morus1280_avx2_algs,
diff --git a/arch/x86/crypto/morus1280-sse2-glue.c 
b/arch/x86/crypto/morus1280-sse2-glue.c
in

Re: [PATCH] crypto: x86/aegis - Fix CPUID checks

2018-08-03 Thread Ondrej Mosnacek
On Thu, Aug 2, 2018 at 11:17 AM Ondrej Mosnacek  wrote:
> It turns out I had misunderstood how the x86_match_cpu() function works.
> It evaluates a logical OR of the matching conditions, not logical AND.
> This caused the CPU feature checks to pass even if only SSE2 (but not
> AES-NI) was supported (ir vice versa), leading to potential crashes if
> something tried to use the registered algs.
>
> This patch fixes the checks to ensure that both required CPU features
> are supported. The MODULE_DEVICE_TABLE declaration is specified only for
> the AES-NI feature array, because I'm not sure what having multiple such
> declarations would cause and I believe it should suffice to match on the
> more important feature at the alias level and let the init routine do
> the full check.
>
> Signed-off-by: Ondrej Mosnacek 
> ---
> Hi Herbert,
>
> this patch fixes the CPU checks of the AEGIS AES-NI/SSE2 implementations
> that have been introduced in 4.18. Once reviewed, it should go to Linus'
> tree since it may cause crashes on some systems if the corresponding
> configs are enabled.
>
> @x86 folks, please take a look if I use the MODULE_DEVICE_TABLE macro
> correctly here (I'm not sure how to properly declare that the module
> needs two CPU features to be both supported; all other modules I saw had
> either only single match rule or required just one of the rules to
> match).

Never mind, I realized MODULE_DEVICE_TABLE isn't actually needed at
all here (most modules in arch/x86/crypto don't even use it) and IIUC
it will cause the modules to load automatically on boot (which isn't
desirable for these new algorithms).

I'll send a v2 of the patch that converts both AEGIS and MORUS checks
to what most other modules use (e.g. the Camellia x86 optimizations).

>
> Thanks,
>
> Ondrej Mosnacek
>
>  arch/x86/crypto/aegis128-aesni-glue.c  | 8 ++--
>  arch/x86/crypto/aegis128l-aesni-glue.c | 8 ++--
>  arch/x86/crypto/aegis256-aesni-glue.c  | 8 ++--
>  3 files changed, 18 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/crypto/aegis128-aesni-glue.c 
> b/arch/x86/crypto/aegis128-aesni-glue.c
> index 5de7c0d46edf..6a5abed59593 100644
> --- a/arch/x86/crypto/aegis128-aesni-glue.c
> +++ b/arch/x86/crypto/aegis128-aesni-glue.c
> @@ -377,14 +377,18 @@ static struct aead_alg crypto_aegis128_aesni_alg[] = {
>
>  static const struct x86_cpu_id aesni_cpu_id[] = {
> X86_FEATURE_MATCH(X86_FEATURE_AES),
> -   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
> {}
>  };
>  MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
>
> +static const struct x86_cpu_id sse2_cpu_id[] = {
> +   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
> +   {}
> +};
> +
>  static int __init crypto_aegis128_aesni_module_init(void)
>  {
> -   if (!x86_match_cpu(aesni_cpu_id))
> +   if (!x86_match_cpu(aesni_cpu_id) || !x86_match_cpu(sse2_cpu_id))
> return -ENODEV;
>
> return crypto_register_aeads(crypto_aegis128_aesni_alg,
> diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c 
> b/arch/x86/crypto/aegis128l-aesni-glue.c
> index 876e4866e633..691c52701c2d 100644
> --- a/arch/x86/crypto/aegis128l-aesni-glue.c
> +++ b/arch/x86/crypto/aegis128l-aesni-glue.c
> @@ -377,14 +377,18 @@ static struct aead_alg crypto_aegis128l_aesni_alg[] = {
>
>  static const struct x86_cpu_id aesni_cpu_id[] = {
> X86_FEATURE_MATCH(X86_FEATURE_AES),
> -   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
> {}
>  };
>  MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
>
> +static const struct x86_cpu_id sse2_cpu_id[] = {
> +   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
> +   {}
> +};
> +
>  static int __init crypto_aegis128l_aesni_module_init(void)
>  {
> -   if (!x86_match_cpu(aesni_cpu_id))
> +   if (!x86_match_cpu(aesni_cpu_id) || !x86_match_cpu(sse2_cpu_id))
> return -ENODEV;
>
> return crypto_register_aeads(crypto_aegis128l_aesni_alg,
> diff --git a/arch/x86/crypto/aegis256-aesni-glue.c 
> b/arch/x86/crypto/aegis256-aesni-glue.c
> index 2b5dd3af8f4d..481b5e4f4fd0 100644
> --- a/arch/x86/crypto/aegis256-aesni-glue.c
> +++ b/arch/x86/crypto/aegis256-aesni-glue.c
> @@ -377,14 +377,18 @@ static struct aead_alg crypto_aegis256_aesni_alg[] = {
>
>  static const struct x86_cpu_id aesni_cpu_id[] = {
> X86_FEATURE_MATCH(X86_FEATURE_AES),
> -   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
> {}
>  };
>  MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
>
> +static const struct x86_cpu_id sse2_cpu_id[] = {
> +   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
> +   {}
> +};
> +
>  static int __init crypto_aegis256_aesni_module_init(void)
>  {
> -   if (!x86_match_cpu(aesni_cpu_id))
> +   if (!x86_match_cpu(aesni_cpu_id) || !x86_match_cpu(sse2_cpu_id))
> return -ENODEV;
>
> return crypto_register_aeads(crypto_aegis256_aesni_alg,
> --
> 2.17.1
>

-- 
Ondrej Mosnacek 
Associate Software Engineer, Security Technologies
Red Hat, Inc.


[PATCH] crypto: x86/aegis - Fix CPUID checks

2018-08-02 Thread Ondrej Mosnacek
It turns out I had misunderstood how the x86_match_cpu() function works.
It evaluates a logical OR of the matching conditions, not logical AND.
This caused the CPU feature checks to pass even if only SSE2 (but not
AES-NI) was supported (or vice versa), leading to potential crashes if
something tried to use the registered algs.

This patch fixes the checks to ensure that both required CPU features
are supported. The MODULE_DEVICE_TABLE declaration is specified only for
the AES-NI feature array, because I'm not sure what having multiple such
declarations would cause and I believe it should suffice to match on the
more important feature at the alias level and let the init routine do
the full check.

Signed-off-by: Ondrej Mosnacek 
---
Hi Herbert,

this patch fixes the CPU checks of the AEGIS AES-NI/SSE2 implementations
that have been introduced in 4.18. Once reviewed, it should go to Linus'
tree since it may cause crashes on some systems if the corresponding
configs are enabled.

@x86 folks, please take a look if I use the MODULE_DEVICE_TABLE macro
correctly here (I'm not sure how to properly declare that the module
needs two CPU features to be both supported; all other modules I saw had
either only single match rule or required just one of the rules to
match).

Thanks,

Ondrej Mosnacek

 arch/x86/crypto/aegis128-aesni-glue.c  | 8 ++--
 arch/x86/crypto/aegis128l-aesni-glue.c | 8 ++--
 arch/x86/crypto/aegis256-aesni-glue.c  | 8 ++--
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-glue.c 
b/arch/x86/crypto/aegis128-aesni-glue.c
index 5de7c0d46edf..6a5abed59593 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -377,14 +377,18 @@ static struct aead_alg crypto_aegis128_aesni_alg[] = {
 
 static const struct x86_cpu_id aesni_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_AES),
-   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
{}
 };
 MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
 
+static const struct x86_cpu_id sse2_cpu_id[] = {
+   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+   {}
+};
+
 static int __init crypto_aegis128_aesni_module_init(void)
 {
-   if (!x86_match_cpu(aesni_cpu_id))
+   if (!x86_match_cpu(aesni_cpu_id) || !x86_match_cpu(sse2_cpu_id))
return -ENODEV;
 
return crypto_register_aeads(crypto_aegis128_aesni_alg,
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c 
b/arch/x86/crypto/aegis128l-aesni-glue.c
index 876e4866e633..691c52701c2d 100644
--- a/arch/x86/crypto/aegis128l-aesni-glue.c
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -377,14 +377,18 @@ static struct aead_alg crypto_aegis128l_aesni_alg[] = {
 
 static const struct x86_cpu_id aesni_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_AES),
-   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
{}
 };
 MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
 
+static const struct x86_cpu_id sse2_cpu_id[] = {
+   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+   {}
+};
+
 static int __init crypto_aegis128l_aesni_module_init(void)
 {
-   if (!x86_match_cpu(aesni_cpu_id))
+   if (!x86_match_cpu(aesni_cpu_id) || !x86_match_cpu(sse2_cpu_id))
return -ENODEV;
 
return crypto_register_aeads(crypto_aegis128l_aesni_alg,
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c 
b/arch/x86/crypto/aegis256-aesni-glue.c
index 2b5dd3af8f4d..481b5e4f4fd0 100644
--- a/arch/x86/crypto/aegis256-aesni-glue.c
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -377,14 +377,18 @@ static struct aead_alg crypto_aegis256_aesni_alg[] = {
 
 static const struct x86_cpu_id aesni_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_AES),
-   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
{}
 };
 MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
 
+static const struct x86_cpu_id sse2_cpu_id[] = {
+   X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+   {}
+};
+
 static int __init crypto_aegis256_aesni_module_init(void)
 {
-   if (!x86_match_cpu(aesni_cpu_id))
+   if (!x86_match_cpu(aesni_cpu_id) || !x86_match_cpu(sse2_cpu_id))
return -ENODEV;
 
return crypto_register_aeads(crypto_aegis256_aesni_alg,
-- 
2.17.1



[PATCH] crypto: morus640 - Fix out-of-bounds access

2018-06-13 Thread Ondrej Mosnacek
We must load the block from the temporary variable here, not directly
from the input.

Also add forgotten zeroing-out of the uninitialized part of the
temporary block (as is done correctly in morus1280.c).
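The pattern being restored is the usual partial-block handling: copy the
tail into a zero-padded temporary, run the block function on the temporary
rather than on the raw input pointer, and copy only the valid bytes back
out.  A generic stand-alone sketch (the 16-byte block and the xor-with-0xAA
placeholder are illustrative, not the real MORUS core):

/* Generic sketch of the partial-block handling the fix restores. */
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 16

static void process_block(unsigned char block[BLOCK_SIZE])
{
	for (int i = 0; i < BLOCK_SIZE; i++)
		block[i] ^= 0xAA;	/* stand-in for the real core function */
}

static void process_tail(unsigned char *dst, const unsigned char *src,
			 size_t size)	/* size < BLOCK_SIZE */
{
	unsigned char tail[BLOCK_SIZE];

	memcpy(tail, src, size);
	memset(tail + size, 0, BLOCK_SIZE - size);	/* the forgotten zeroing */

	process_block(tail);		/* operate on the padded copy, not src */
	memcpy(dst, tail, size);	/* only 'size' bytes are valid output */
}

int main(void)
{
	unsigned char in[5] = { 'a', 'b', 'c', 'd', 'e' }, out[5];

	process_tail(out, in, sizeof(in));
	for (size_t i = 0; i < sizeof(out); i++)
		printf("%02x ", out[i]);
	printf("\n");
	return 0;
}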

Fixes: 396be41f16fd ("crypto: morus - Add generic MORUS AEAD implementations")
Reported-by: syzbot+1fafa9c4cf42df33f...@syzkaller.appspotmail.com
Reported-by: syzbot+d82643ba80bf6937c...@syzkaller.appspotmail.com
Signed-off-by: Ondrej Mosnacek 
---
 crypto/morus640.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crypto/morus640.c b/crypto/morus640.c
index 9fbcde307daf..5eede3749e64 100644
--- a/crypto/morus640.c
+++ b/crypto/morus640.c
@@ -274,8 +274,9 @@ static void crypto_morus640_decrypt_chunk(struct morus640_state *state, u8 *dst,
union morus640_block_in tail;
 
memcpy(tail.bytes, src, size);
+   memset(tail.bytes + size, 0, MORUS640_BLOCK_SIZE - size);
 
-   crypto_morus640_load_a(, src);
+   crypto_morus640_load_a(, tail.bytes);
crypto_morus640_core(state, );
crypto_morus640_store_a(tail.bytes, );
memset(tail.bytes + size, 0, MORUS640_BLOCK_SIZE - size);
-- 
2.17.1



[PATCH] crypto: skcipher - Fix skcipher_walk_aead_common

2017-11-23 Thread Ondrej Mosnacek
The skcipher_walk_aead_common function calls scatterwalk_copychunks on
the input and output walks to skip the associated data. If the AD ends
at an SG list entry boundary, then after these calls the walks will
still be pointing to the end of the skipped region.

These offsets are later checked for alignment in skcipher_walk_next,
so the skcipher_walk may detect the alignment incorrectly.

This patch fixes it by calling scatterwalk_done after the copychunks
calls to ensure that the offsets refer to the right SG list entry.

Fixes: b286d8b1a690 ("crypto: skcipher - Add skcipher walk interface")
Cc: <sta...@vger.kernel.org>
Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 crypto/skcipher.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 4faa0fd53b0c..6c45ed536664 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -517,6 +517,9 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
scatterwalk_copychunks(NULL, >in, req->assoclen, 2);
scatterwalk_copychunks(NULL, >out, req->assoclen, 2);
 
+   scatterwalk_done(>in, 0, walk->total);
+   scatterwalk_done(>out, 0, walk->total);
+
walk->iv = req->iv;
walk->oiv = req->iv;
 
-- 
2.14.1



Re: dm-crypt IV generation (summary)

2017-05-18 Thread Ondrej Mosnacek
2017-04-07 8:12 GMT+02:00 Herbert Xu <herb...@gondor.apana.org.au>:
> On Fri, Mar 10, 2017 at 02:44:26PM +0100, Ondrej Mosnacek wrote:
>>
>> ISSUES:
>> a) The 'keycount' parameter.
>> In order to support multi-key modes from Loop-AES,
>> dm-crypt accepts a keycount parameter which, if it != 1, causes
>> consecutive sectors to be encrypted with a different key. This
>> parameter can be specified with any of the cipher modes, which makes
>> porting the whole scale of modes supported by dm-crypt to crypto API
>> rather messy, since a lot of dm-crypt specific stuff needs to be moved
>> into the crypto drivers.
>
> Actually I think this one can probably be easily handled in the crypto
> layer.  All we need is to add a multikey template that sits on top
> of an underlying IV generator.  The multikey instance can then accept
> a key length that is a multiple of the underlying key length.

I thought of that, too, but unfortunately with TCW the key structure is:

| KEY 1 | KEY 2 | ... | KEY n | IV SEED (size = IV size) | WHITENING (size = 16 bytes) |

So part of the key needs to be processed before it is split into multiple keys.

Also with the LMK mode, there is a similar optional key appendix,
which is of the same size as the other subkeys.
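A rough user-space sketch of the key layout above, to illustrate why the
trailing IV seed and whitening bytes have to be split off before the rest
can be divided into per-sector subkeys (the sizes, names and helper here
are illustrative only; see drivers/md/dm-crypt.c for the real parsing):

/* Sketch of splitting a TCW-style key: KEY1..KEYn | IV SEED | WHITENING. */
#include <stddef.h>
#include <stdio.h>

#define IV_SIZE        16	/* assumed cipher IV size   */
#define WHITENING_SIZE 16	/* fixed 16 bytes, as above */

struct tcw_key_view {
	const unsigned char *subkeys;	/* KEY 1 .. KEY n, back to back */
	size_t               subkey_len;
	size_t               nkeys;
	const unsigned char *iv_seed;	/* IV_SIZE bytes        */
	const unsigned char *whitening;	/* WHITENING_SIZE bytes */
};

static int tcw_split_key(const unsigned char *key, size_t keylen,
			 size_t nkeys, struct tcw_key_view *out)
{
	size_t tail = IV_SIZE + WHITENING_SIZE;

	/* the tail must be stripped before the subkeys can be sized */
	if (keylen < tail || (keylen - tail) % nkeys != 0)
		return -1;

	out->subkeys    = key;
	out->nkeys      = nkeys;
	out->subkey_len = (keylen - tail) / nkeys;
	out->iv_seed    = key + keylen - tail;
	out->whitening  = key + keylen - WHITENING_SIZE;
	return 0;
}

int main(void)
{
	unsigned char key[2 * 32 + IV_SIZE + WHITENING_SIZE] = { 0 };
	struct tcw_key_view v;

	if (tcw_split_key(key, sizeof(key), 2, &v) == 0)
		printf("n=%zu, subkey_len=%zu\n", v.nkeys, v.subkey_len);
	return 0;
}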

>> b) New AEAD functionality; random IV generator.
>> The soon-to-be-added AEAD functionality in dm-crypt
>> introduces a new IV generator that generates IVs randomly and stores
>> them as sector metadata. This means IV generation cannot be handled
>> solely in the driver. Also, additional AEAD implementation of IV
>> generators would be eventually needed.
>
> Again I don't see the problem here.  IV generators are supposed
> to return the IV to the caller so that it can be transmitted.
>
> For example, the IV generated in IPsec is explicitly transmitted.
> Here we just need to store the IV.

My point is that it doesn't make much sense to have a crypto API alg
that calls get_random_bytes() as part of its implementation. IMHO,
that might tempt HW drivers to replace it with some crappy
alternatives, which wouldn't be good... Also, how would we test such
an alg with testmgr?

I would say the best solution in this case would be to treat the
'random' IV mode as a special case in dm-crypt: the IVs would be
generated in dm-crypt and passed to a special template (which would
have a similar interface as the other IV generation templates) that
would just pass the generated IVs to underlying skciphers instead of
generating them from the sequence number. This template would need to
provide some way to pass the IVs, e.g. by prepending them to the
plaintext/ciphertext.

Cheers,
Ondrej


[PATCH v5 3/4] crypto: glue_helper - remove the le128_gf128mul_x_ble function

2017-04-02 Thread Ondrej Mosnacek
The le128_gf128mul_x_ble function in glue_helper.h is now obsolete and
can be replaced with the gf128mul_x_ble function from gf128mul.h.

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 arch/x86/crypto/glue_helper.c |  3 ++-
 arch/x86/include/asm/crypto/glue_helper.h | 10 --
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 260a060..24ac9fa 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -27,6 +27,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -457,7 +458,7 @@ void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const 
u128 *src, le128 *iv,
le128 ivblk = *iv;
 
/* generate next IV */
-   le128_gf128mul_x_ble(iv, );
+   gf128mul_x_ble(iv, );
 
/* CC <- T xor C */
u128_xor(dst, src, (u128 *));
diff --git a/arch/x86/include/asm/crypto/glue_helper.h 
b/arch/x86/include/asm/crypto/glue_helper.h
index 29e53ea..ed8b66d 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -125,16 +125,6 @@ static inline void le128_inc(le128 *i)
i->b = cpu_to_le64(b);
 }
 
-static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src)
-{
-   u64 a = le64_to_cpu(src->a);
-   u64 b = le64_to_cpu(src->b);
-   u64 _tt = ((s64)a >> 63) & 0x87;
-
-   dst->a = cpu_to_le64((a << 1) ^ (b >> 63));
-   dst->b = cpu_to_le64((b << 1) ^ _tt);
-}
-
 extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
 struct blkcipher_desc *desc,
 struct scatterlist *dst,
-- 
2.9.3



[PATCH v5 0/4] gf128mul refactoring

2017-04-02 Thread Ondrej Mosnacek
This patchset contains the following gf128mul-related changes:
  1. The gf128mul_x_* functions are moved to gf128mul.h for performance reasons.
  2. The gf128mul_x_ble function is fixed to use the correct block type.
  3. The le128_gf128mul_x_ble function from glue_helper.h is removed and its
 usages replaced with gf128mul_x_ble calls.
  4. The now obsolete dependency of CRYPTO_XTS on CRYPTO_GF128MUL is removed.

v4 -> v5: added the other three patches
v3 -> v4: a faster version of gf128mul_x_lle
v2 -> v3: constant-time implementation
v1 -> v2: move all _x_ functions to the header, not just gf128mul_x_ble

Ondrej Mosnacek (4):
  crypto: gf128mul - define gf128mul_x_* in gf128mul.h
  crypto: gf128mul - switch gf128mul_x_ble to le128
  crypto: glue_helper - remove the le128_gf128mul_x_ble function
  crypto: xts - drop gf128mul dependency

 arch/x86/crypto/camellia_glue.c   |  4 +--
 arch/x86/crypto/glue_helper.c |  3 +-
 arch/x86/crypto/serpent_sse2_glue.c   |  4 +--
 arch/x86/crypto/twofish_glue_3way.c   |  4 +--
 arch/x86/include/asm/crypto/glue_helper.h | 10 --
 crypto/Kconfig|  1 -
 crypto/gf128mul.c | 33 +--
 crypto/xts.c  | 38 ++---
 include/crypto/gf128mul.h | 55 +--
 include/crypto/xts.h  |  2 +-
 10 files changed, 82 insertions(+), 72 deletions(-)

-- 
2.9.3



[PATCH v5 1/4] crypto: gf128mul - define gf128mul_x_* in gf128mul.h

2017-04-02 Thread Ondrej Mosnacek
The gf128mul_x_ble function is currently defined in gf128mul.c, because
it depends on the gf128mul_table_be multiplication table.

However, since the function is very small and only uses two values from
the table, it is better for it to be defined as inline function in
gf128mul.h. That way, the function can be inlined by the compiler for
better performance.

For consistency, the other gf128mul_x_* functions are also moved to the
header file. In addition, the code is rewritten to be constant-time.

After this change, the speed of the generic 'xts(aes)' implementation
increased from ~225 MiB/s to ~235 MiB/s (measured using 'cryptsetup
benchmark -c aes-xts-plain64' on an Intel system with CRYPTO_AES_X86_64
and CRYPTO_AES_NI_INTEL disabled).
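As a quick sanity check of the constant-time rewrite, the mask helper can
be exercised in user space.  The sketch below restates
gf128mul_mask_from_bit() outside the kernel and compares it against the
branchy expression it replaces; it is only an illustration, not part of
the patch:

/* Stand-alone check that the constant-time mask equals the branchy
 * 'x & ((u64)1 << which) ? (u64)-1 : 0' for a range of inputs. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t mask_from_bit(uint64_t x, int which)
{
	/* mirrors gf128mul_mask_from_bit(); relies on arithmetic shift */
	return (uint64_t)((int64_t)(x << (63 - which)) >> 63);
}

int main(void)
{
	const uint64_t samples[] = {
		0, 1, 0x87, 1ULL << 63, ~0ULL, 0x123456789abcdef0ULL
	};

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		for (int bit = 0; bit < 64; bit++) {
			uint64_t ref = (samples[i] & ((uint64_t)1 << bit))
				       ? ~0ULL : 0;
			assert(mask_from_bit(samples[i], bit) == ref);
		}
	}

	/* e.g. gf128mul_x_ble()'s _tt term: 0x87 only when the top bit is set */
	printf("_tt for b = 1<<63: 0x%llx\n",
	       (unsigned long long)(mask_from_bit(1ULL << 63, 63) & 0x87));
	return 0;
}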

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
Cc: Eric Biggers <ebigg...@google.com>
---
 crypto/gf128mul.c | 33 +---
 include/crypto/gf128mul.h | 55 +--
 2 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index 04facc0..dc01212 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -130,43 +130,12 @@ static const u16 gf128mul_table_le[256] = 
gf128mul_dat(xda_le);
 static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
 
 /*
- * The following functions multiply a field element by x or by x^8 in
+ * The following functions multiply a field element by x^8 in
  * the polynomial field representation.  They use 64-bit word operations
  * to gain speed but compensate for machine endianness and hence work
  * correctly on both styles of machine.
  */
 
-static void gf128mul_x_lle(be128 *r, const be128 *x)
-{
-   u64 a = be64_to_cpu(x->a);
-   u64 b = be64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_le[(b << 7) & 0xff];
-
-   r->b = cpu_to_be64((b >> 1) | (a << 63));
-   r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
-}
-
-static void gf128mul_x_bbe(be128 *r, const be128 *x)
-{
-   u64 a = be64_to_cpu(x->a);
-   u64 b = be64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_be[a >> 63];
-
-   r->a = cpu_to_be64((a << 1) | (b >> 63));
-   r->b = cpu_to_be64((b << 1) ^ _tt);
-}
-
-void gf128mul_x_ble(be128 *r, const be128 *x)
-{
-   u64 a = le64_to_cpu(x->a);
-   u64 b = le64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_be[b >> 63];
-
-   r->a = cpu_to_le64((a << 1) ^ _tt);
-   r->b = cpu_to_le64((b << 1) | (a >> 63));
-}
-EXPORT_SYMBOL(gf128mul_x_ble);
-
 static void gf128mul_x8_lle(be128 *x)
 {
u64 a = be64_to_cpu(x->a);
diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h
index 0bc9b5f..35ced9d 100644
--- a/include/crypto/gf128mul.h
+++ b/include/crypto/gf128mul.h
@@ -49,6 +49,7 @@
 #ifndef _CRYPTO_GF128MUL_H
 #define _CRYPTO_GF128MUL_H
 
+#include 
 #include 
 #include 
 
@@ -163,8 +164,58 @@ void gf128mul_lle(be128 *a, const be128 *b);
 
 void gf128mul_bbe(be128 *a, const be128 *b);
 
-/* multiply by x in ble format, needed by XTS */
-void gf128mul_x_ble(be128 *a, const be128 *b);
+/*
+ * The following functions multiply a field element by x in
+ * the polynomial field representation.  They use 64-bit word operations
+ * to gain speed but compensate for machine endianness and hence work
+ * correctly on both styles of machine.
+ *
+ * They are defined here for performance.
+ */
+
+static inline u64 gf128mul_mask_from_bit(u64 x, int which)
+{
+   /* a constant-time version of 'x & ((u64)1 << which) ? (u64)-1 : 0' */
+   return ((s64)(x << (63 - which)) >> 63);
+}
+
+static inline void gf128mul_x_lle(be128 *r, const be128 *x)
+{
+   u64 a = be64_to_cpu(x->a);
+   u64 b = be64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48
+* (see crypto/gf128mul.c): */
+   u64 _tt = gf128mul_mask_from_bit(b, 0) & ((u64)0xe1 << 56);
+
+   r->b = cpu_to_be64((b >> 1) | (a << 63));
+   r->a = cpu_to_be64((a >> 1) ^ _tt);
+}
+
+static inline void gf128mul_x_bbe(be128 *r, const be128 *x)
+{
+   u64 a = be64_to_cpu(x->a);
+   u64 b = be64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */
+   u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87;
+
+   r->a = cpu_to_be64((a << 1) | (b >> 63));
+   r->b = cpu_to_be64((b << 1) ^ _tt);
+}
+
+/* needed by XTS */
+static inline void gf128mul_x_ble(be128 *r, const be128 *x)
+{
+   u64 a = le64_to_cpu(x->a);
+   u64 b = le64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */
+   u64 _tt = gf128mul_mask_from_bit(b, 63) & 0x87;
+
+   r->a = cpu_to_le64((a << 1) ^ _tt);
+   r->b = cpu_to_le64((b << 1) | (a >> 63));
+}
 
 /* 4k table optimization */
 
-- 
2.9.3



[PATCH v5 2/4] crypto: gf128mul - switch gf128mul_x_ble to le128

2017-04-02 Thread Ondrej Mosnacek
Currently, gf128mul_x_ble works with pointers to be128, even though it
actually interprets the words as little-endian. Consequently, it uses
cpu_to_le64/le64_to_cpu on fields of type __be64, which is incorrect.

This patch fixes that by changing the function to accept pointers to
le128 and updating all users accordingly.

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 arch/x86/crypto/camellia_glue.c |  4 ++--
 arch/x86/crypto/serpent_sse2_glue.c |  4 ++--
 arch/x86/crypto/twofish_glue_3way.c |  4 ++--
 crypto/xts.c| 38 ++---
 include/crypto/gf128mul.h   |  8 
 include/crypto/xts.h|  2 +-
 6 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index aa76cad..af4840a 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1522,7 +1522,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, 
struct scatterlist *dst,
   struct scatterlist *src, unsigned int nbytes)
 {
struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-   be128 buf[2 * 4];
+   le128 buf[2 * 4];
struct xts_crypt_req req = {
.tbuf = buf,
.tbuflen = sizeof(buf),
@@ -1540,7 +1540,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, 
struct scatterlist *dst,
   struct scatterlist *src, unsigned int nbytes)
 {
struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-   be128 buf[2 * 4];
+   le128 buf[2 * 4];
struct xts_crypt_req req = {
.tbuf = buf,
.tbuflen = sizeof(buf),
diff --git a/arch/x86/crypto/serpent_sse2_glue.c 
b/arch/x86/crypto/serpent_sse2_glue.c
index 644f97a..ac0e831 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -328,7 +328,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct 
scatterlist *dst,
   struct scatterlist *src, unsigned int nbytes)
 {
struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-   be128 buf[SERPENT_PARALLEL_BLOCKS];
+   le128 buf[SERPENT_PARALLEL_BLOCKS];
struct crypt_priv crypt_ctx = {
.ctx = >crypt_ctx,
.fpu_enabled = false,
@@ -355,7 +355,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct 
scatterlist *dst,
   struct scatterlist *src, unsigned int nbytes)
 {
struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-   be128 buf[SERPENT_PARALLEL_BLOCKS];
+   le128 buf[SERPENT_PARALLEL_BLOCKS];
struct crypt_priv crypt_ctx = {
.ctx = >crypt_ctx,
.fpu_enabled = false,
diff --git a/arch/x86/crypto/twofish_glue_3way.c 
b/arch/x86/crypto/twofish_glue_3way.c
index 2ebb5e9..243e90a 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -296,7 +296,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct 
scatterlist *dst,
   struct scatterlist *src, unsigned int nbytes)
 {
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-   be128 buf[3];
+   le128 buf[3];
struct xts_crypt_req req = {
.tbuf = buf,
.tbuflen = sizeof(buf),
@@ -314,7 +314,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct 
scatterlist *dst,
   struct scatterlist *src, unsigned int nbytes)
 {
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-   be128 buf[3];
+   le128 buf[3];
struct xts_crypt_req req = {
.tbuf = buf,
.tbuflen = sizeof(buf),
diff --git a/crypto/xts.c b/crypto/xts.c
index baeb34d..bd5065c 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -39,11 +39,11 @@ struct xts_instance_ctx {
 };
 
 struct rctx {
-   be128 buf[XTS_BUFFER_SIZE / sizeof(be128)];
+   le128 buf[XTS_BUFFER_SIZE / sizeof(le128)];
 
-   be128 t;
+   le128 t;
 
-   be128 *ext;
+   le128 *ext;
 
struct scatterlist srcbuf[2];
struct scatterlist dstbuf[2];
@@ -99,7 +99,7 @@ static int setkey(struct crypto_skcipher *parent, const u8 
*key,
 static int post_crypt(struct skcipher_request *req)
 {
struct rctx *rctx = skcipher_request_ctx(req);
-   be128 *buf = rctx->ext ?: rctx->buf;
+   le128 *buf = rctx->ext ?: rctx->buf;
struct skcipher_request *subreq;
const int bs = XTS_BLOCK_SIZE;
struct skcipher_walk w;
@@ -112,12 +112,12 @@ static int post_crypt(struct skcipher_request *req)
 
while (w.nbytes) {
unsigned int avail = w.nbytes;
-   be128 *wdst;
+   le128 *wdst;
 
wdst = w.dst.virt.addr;
 
do {
-

[PATCH v4] crypto: gf128mul - define gf128mul_x_* in gf128mul.h

2017-04-01 Thread Ondrej Mosnacek
The gf128mul_x_ble function is currently defined in gf128mul.c, because
it depends on the gf128mul_table_be multiplication table.

However, since the function is very small and only uses two values from
the table, it is better for it to be defined as an inline function in
gf128mul.h. That way, the function can be inlined by the compiler for
better performance.

For consistency, the other gf128mul_x_* functions are also moved to the
header file. In addition, the code is rewritten to be constant-time.
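
For reference, the 0x87 constant that replaces the table lookup below
encodes the GF(2^128) reduction polynomial x^128 + x^7 + x^2 + x + 1,
and the 0xe1 byte shifted into the top of the word plays the same role
for the bit-reflected (lle) convention. A minimal standalone
illustration of the constant-time mask trick (not kernel code; uint64_t
stands in for u64, and it relies on arithmetic right shift of signed
values just like the kernel version does):

#include <stdint.h>
#include <stdio.h>

/* all-ones if bit 'which' of x is set, zero otherwise, with no
 * data-dependent branch (same idea as gf128mul_mask_from_bit()) */
static uint64_t mask_from_bit(uint64_t x, int which)
{
	return (uint64_t)((int64_t)(x << (63 - which)) >> 63);
}

int main(void)
{
	uint64_t msb_set = UINT64_C(1) << 63;

	/* prints 0x87: the reduction constant gets applied */
	printf("0x%llx\n",
	       (unsigned long long)(mask_from_bit(msb_set, 63) & 0x87));
	/* prints 0x0: no reduction needed */
	printf("0x%llx\n",
	       (unsigned long long)(mask_from_bit(0, 63) & 0x87));
	return 0;
}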

After this change, the speed of the generic 'xts(aes)' implementation
increased from ~225 MiB/s to ~235 MiB/s (measured using 'cryptsetup
benchmark -c aes-xts-plain64' on an Intel system with CRYPTO_AES_X86_64
and CRYPTO_AES_NI_INTEL disabled).

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
Cc: Eric Biggers <ebigg...@google.com>
---
v3 -> v4: a faster version of gf128mul_x_lle
v2 -> v3: constant-time implementation
v1 -> v2: move all _x_ functions to the header, not just gf128mul_x_ble

 crypto/gf128mul.c | 33 +---
 include/crypto/gf128mul.h | 55 +--
 2 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index 04facc0..dc01212 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -130,43 +130,12 @@ static const u16 gf128mul_table_le[256] = 
gf128mul_dat(xda_le);
 static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
 
 /*
- * The following functions multiply a field element by x or by x^8 in
+ * The following functions multiply a field element by x^8 in
  * the polynomial field representation.  They use 64-bit word operations
  * to gain speed but compensate for machine endianness and hence work
  * correctly on both styles of machine.
  */
 
-static void gf128mul_x_lle(be128 *r, const be128 *x)
-{
-   u64 a = be64_to_cpu(x->a);
-   u64 b = be64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_le[(b << 7) & 0xff];
-
-   r->b = cpu_to_be64((b >> 1) | (a << 63));
-   r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
-}
-
-static void gf128mul_x_bbe(be128 *r, const be128 *x)
-{
-   u64 a = be64_to_cpu(x->a);
-   u64 b = be64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_be[a >> 63];
-
-   r->a = cpu_to_be64((a << 1) | (b >> 63));
-   r->b = cpu_to_be64((b << 1) ^ _tt);
-}
-
-void gf128mul_x_ble(be128 *r, const be128 *x)
-{
-   u64 a = le64_to_cpu(x->a);
-   u64 b = le64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_be[b >> 63];
-
-   r->a = cpu_to_le64((a << 1) ^ _tt);
-   r->b = cpu_to_le64((b << 1) | (a >> 63));
-}
-EXPORT_SYMBOL(gf128mul_x_ble);
-
 static void gf128mul_x8_lle(be128 *x)
 {
u64 a = be64_to_cpu(x->a);
diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h
index 0bc9b5f..35ced9d 100644
--- a/include/crypto/gf128mul.h
+++ b/include/crypto/gf128mul.h
@@ -49,6 +49,7 @@
 #ifndef _CRYPTO_GF128MUL_H
 #define _CRYPTO_GF128MUL_H
 
+#include 
 #include 
 #include 
 
@@ -163,8 +164,58 @@ void gf128mul_lle(be128 *a, const be128 *b);
 
 void gf128mul_bbe(be128 *a, const be128 *b);
 
-/* multiply by x in ble format, needed by XTS */
-void gf128mul_x_ble(be128 *a, const be128 *b);
+/*
+ * The following functions multiply a field element by x in
+ * the polynomial field representation.  They use 64-bit word operations
+ * to gain speed but compensate for machine endianness and hence work
+ * correctly on both styles of machine.
+ *
+ * They are defined here for performance.
+ */
+
+static inline u64 gf128mul_mask_from_bit(u64 x, int which)
+{
+   /* a constant-time version of 'x & ((u64)1 << which) ? (u64)-1 : 0' */
+   return ((s64)(x << (63 - which)) >> 63);
+}
+
+static inline void gf128mul_x_lle(be128 *r, const be128 *x)
+{
+   u64 a = be64_to_cpu(x->a);
+   u64 b = be64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48
+* (see crypto/gf128mul.c): */
+   u64 _tt = gf128mul_mask_from_bit(b, 0) & ((u64)0xe1 << 56);
+
+   r->b = cpu_to_be64((b >> 1) | (a << 63));
+   r->a = cpu_to_be64((a >> 1) ^ _tt);
+}
+
+static inline void gf128mul_x_bbe(be128 *r, const be128 *x)
+{
+   u64 a = be64_to_cpu(x->a);
+   u64 b = be64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */
+   u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87;
+
+   r->a = cpu_to_be64((a << 1) | (b >> 63));
+   r->b = cpu_to_be64((b << 1) ^ _tt);
+}
+
+/* needed by XTS */
+static inline void gf128mul_x_ble(be128 *r, const be128 *x)
+{
+   u64 a = le64_to_cpu(x->a);
+   u64 b = le64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */
+   u64 _tt = gf128mul_mask_from_bit(b, 63) & 0x87;
+
+   r->a = cpu_to_le64((a << 1) ^ _tt);
+   r->b = cpu_to_le64((b << 1) | (a >> 63));
+}
 
 /* 4k table optimization */
 
-- 
2.9.3



[PATCH v3] crypto: gf128mul - define gf128mul_x_* in gf128mul.h

2017-03-31 Thread Ondrej Mosnacek
The gf128mul_x_ble function is currently defined in gf128mul.c, because
it depends on the gf128mul_table_be multiplication table.

However, since the function is very small and only uses two values from
the table, it is better for it to be defined as an inline function in
gf128mul.h. That way, the function can be inlined by the compiler for
better performance.

For consistency, the other gf128mul_x_* functions are also moved to the
header file. In addition, the code is rewritten to be constant-time.

After this change, the speed of the generic 'xts(aes)' implementation
increased from ~225 MiB/s to ~235 MiB/s (measured using 'cryptsetup
benchmark -c aes-xts-plain64' on an Intel system with CRYPTO_AES_X86_64
and CRYPTO_AES_NI_INTEL disabled).

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
Cc: Eric Biggers <ebigg...@google.com>
---
v2 -> v3: constant-time implementation
v1 -> v2: move all _x_ functions to the header, not just gf128mul_x_ble

 crypto/gf128mul.c | 33 +---
 include/crypto/gf128mul.h | 55 +--
 2 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index 04facc0..dc01212 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -130,43 +130,12 @@ static const u16 gf128mul_table_le[256] = 
gf128mul_dat(xda_le);
 static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
 
 /*
- * The following functions multiply a field element by x or by x^8 in
+ * The following functions multiply a field element by x^8 in
  * the polynomial field representation.  They use 64-bit word operations
  * to gain speed but compensate for machine endianness and hence work
  * correctly on both styles of machine.
  */
 
-static void gf128mul_x_lle(be128 *r, const be128 *x)
-{
-   u64 a = be64_to_cpu(x->a);
-   u64 b = be64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_le[(b << 7) & 0xff];
-
-   r->b = cpu_to_be64((b >> 1) | (a << 63));
-   r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
-}
-
-static void gf128mul_x_bbe(be128 *r, const be128 *x)
-{
-   u64 a = be64_to_cpu(x->a);
-   u64 b = be64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_be[a >> 63];
-
-   r->a = cpu_to_be64((a << 1) | (b >> 63));
-   r->b = cpu_to_be64((b << 1) ^ _tt);
-}
-
-void gf128mul_x_ble(be128 *r, const be128 *x)
-{
-   u64 a = le64_to_cpu(x->a);
-   u64 b = le64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_be[b >> 63];
-
-   r->a = cpu_to_le64((a << 1) ^ _tt);
-   r->b = cpu_to_le64((b << 1) | (a >> 63));
-}
-EXPORT_SYMBOL(gf128mul_x_ble);
-
 static void gf128mul_x8_lle(be128 *x)
 {
u64 a = be64_to_cpu(x->a);
diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h
index 0bc9b5f..6e43be5 100644
--- a/include/crypto/gf128mul.h
+++ b/include/crypto/gf128mul.h
@@ -49,6 +49,7 @@
 #ifndef _CRYPTO_GF128MUL_H
 #define _CRYPTO_GF128MUL_H
 
+#include 
 #include 
 #include 
 
@@ -163,8 +164,58 @@ void gf128mul_lle(be128 *a, const be128 *b);
 
 void gf128mul_bbe(be128 *a, const be128 *b);
 
-/* multiply by x in ble format, needed by XTS */
-void gf128mul_x_ble(be128 *a, const be128 *b);
+/*
+ * The following functions multiply a field element by x in
+ * the polynomial field representation.  They use 64-bit word operations
+ * to gain speed but compensate for machine endianness and hence work
+ * correctly on both styles of machine.
+ *
+ * They are defined here for performance.
+ */
+
+static inline u64 gf128mul_mask_from_bit(u64 x, int which)
+{
+   /* a constant-time version of 'x & ((u64)1 << which) ? (u64)-1 : 0' */
+   return ((s64)(x << (63 - which)) >> 63);
+}
+
+static inline void gf128mul_x_lle(be128 *r, const be128 *x)
+{
+   u64 a = be64_to_cpu(x->a);
+   u64 b = be64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_le[(b << 7) & 0xff] >> 8
+* (see crypto/gf128mul.c): */
+   u64 _tt = gf128mul_mask_from_bit(b, 0) & 0xe1;
+
+   r->b = cpu_to_be64((b >> 1) | (a << 63));
+   r->a = cpu_to_be64((a >> 1) ^ (_tt << 56));
+}
+
+static inline void gf128mul_x_bbe(be128 *r, const be128 *x)
+{
+   u64 a = be64_to_cpu(x->a);
+   u64 b = be64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */
+   u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87;
+
+   r->a = cpu_to_be64((a << 1) | (b >> 63));
+   r->b = cpu_to_be64((b << 1) ^ _tt);
+}
+
+/* needed by XTS */
+static inline void gf128mul_x_ble(be128 *r, const be128 *x)
+{
+   u64 a = le64_to_cpu(x->a);
+   u64 b = le64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */
+   u64 _tt = gf128mul_mask_from_bit(b, 63) & 0x87;
+
+   r->a = cpu_to_le64((a << 1) ^ _tt);
+   r->b = cpu_to_le64((b << 1) | (a >> 63));
+}
 
 /* 4k table optimization */
 
-- 
2.9.3



[PATCH v2] crypto: gf128mul - define gf128mul_x_* in gf128mul.h

2017-03-30 Thread Ondrej Mosnacek
The gf128mul_x_ble function is currently defined in gf128mul.c, because
it depends on the gf128mul_table_be multiplication table.

However, since the function is very small and only uses two values from
the table, it is better for it to be defined as an inline function in
gf128mul.h. That way, the function can be inlined by the compiler for
better performance.

For consistency, the other gf128mul_x_* functions are also moved to the
header file.

After this change, the speed of the generic 'xts(aes)' implementation
increased from ~225 MiB/s to ~235 MiB/s (measured using 'cryptsetup
benchmark -c aes-xts-plain64' on an Intel system with CRYPTO_AES_X86_64
and CRYPTO_AES_NI_INTEL disabled).

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
Cc: Eric Biggers <ebigg...@google.com>
---
 crypto/gf128mul.c | 33 +--
 include/crypto/gf128mul.h | 49 +--
 2 files changed, 48 insertions(+), 34 deletions(-)

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index 04facc0..dc01212 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -130,43 +130,12 @@ static const u16 gf128mul_table_le[256] = 
gf128mul_dat(xda_le);
 static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
 
 /*
- * The following functions multiply a field element by x or by x^8 in
+ * The following functions multiply a field element by x^8 in
  * the polynomial field representation.  They use 64-bit word operations
  * to gain speed but compensate for machine endianness and hence work
  * correctly on both styles of machine.
  */
 
-static void gf128mul_x_lle(be128 *r, const be128 *x)
-{
-   u64 a = be64_to_cpu(x->a);
-   u64 b = be64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_le[(b << 7) & 0xff];
-
-   r->b = cpu_to_be64((b >> 1) | (a << 63));
-   r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
-}
-
-static void gf128mul_x_bbe(be128 *r, const be128 *x)
-{
-   u64 a = be64_to_cpu(x->a);
-   u64 b = be64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_be[a >> 63];
-
-   r->a = cpu_to_be64((a << 1) | (b >> 63));
-   r->b = cpu_to_be64((b << 1) ^ _tt);
-}
-
-void gf128mul_x_ble(be128 *r, const be128 *x)
-{
-   u64 a = le64_to_cpu(x->a);
-   u64 b = le64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_be[b >> 63];
-
-   r->a = cpu_to_le64((a << 1) ^ _tt);
-   r->b = cpu_to_le64((b << 1) | (a >> 63));
-}
-EXPORT_SYMBOL(gf128mul_x_ble);
-
 static void gf128mul_x8_lle(be128 *x)
 {
u64 a = be64_to_cpu(x->a);
diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h
index 0bc9b5f..2a24553 100644
--- a/include/crypto/gf128mul.h
+++ b/include/crypto/gf128mul.h
@@ -49,6 +49,7 @@
 #ifndef _CRYPTO_GF128MUL_H
 #define _CRYPTO_GF128MUL_H
 
+#include 
 #include 
 #include 
 
@@ -163,8 +164,52 @@ void gf128mul_lle(be128 *a, const be128 *b);
 
 void gf128mul_bbe(be128 *a, const be128 *b);
 
-/* multiply by x in ble format, needed by XTS */
-void gf128mul_x_ble(be128 *a, const be128 *b);
+/*
+ * The following functions multiply a field element by x in
+ * the polynomial field representation.  They use 64-bit word operations
+ * to gain speed but compensate for machine endianness and hence work
+ * correctly on both styles of machine.
+ *
+ * They are defined here for performance.
+ */
+
+static inline void gf128mul_x_lle(be128 *r, const be128 *x)
+{
+   u64 a = be64_to_cpu(x->a);
+   u64 b = be64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_le[(b << 7) & 0xff] >> 8
+* (see crypto/gf128mul.c): */
+   u64 _tt = (b & (u64)1) ? 0xe1 : 0x00;
+
+   r->b = cpu_to_be64((b >> 1) | (a << 63));
+   r->a = cpu_to_be64((a >> 1) ^ (_tt << 56));
+}
+
+static inline void gf128mul_x_bbe(be128 *r, const be128 *x)
+{
+   u64 a = be64_to_cpu(x->a);
+   u64 b = be64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */
+   u64 _tt = (a & ((u64)1 << 63)) ? 0x87 : 0x00;
+
+   r->a = cpu_to_be64((a << 1) | (b >> 63));
+   r->b = cpu_to_be64((b << 1) ^ _tt);
+}
+
+/* needed by XTS */
+static inline void gf128mul_x_ble(be128 *r, const be128 *x)
+{
+   u64 a = le64_to_cpu(x->a);
+   u64 b = le64_to_cpu(x->b);
+
+   /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */
+   u64 _tt = (b & ((u64)1 << 63)) ? 0x87 : 0x00;
+
+   r->a = cpu_to_le64((a << 1) ^ _tt);
+   r->b = cpu_to_le64((b << 1) | (a >> 63));
+}
 
 /* 4k table optimization */
 
-- 
2.9.3



[PATCH] crypto: gf128mul - define gf128mul_x_ble in gf128mul.h

2017-03-30 Thread Ondrej Mosnacek
The gf128mul_x_ble function is currently defined in gf128mul.c, because
it depends on the gf128mul_table_be multiplication table.

However, since the function is very small and only uses two values from
the table, it is better for it to be defined as an inline function in
gf128mul.h. That way, the function can be inlined by the compiler for
better performance.

After this change, the speed of the generic 'xts(aes)' implementation
increased from ~225 MiB/s to ~235 MiB/s (measured using 'cryptsetup
benchmark' on an Intel system with CRYPTO_AES_X86_64 and
CRYPTO_AES_NI_INTEL disabled).

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 crypto/gf128mul.c | 11 ---
 include/crypto/gf128mul.h | 15 +--
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index 04facc0..2eab1a1 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -156,17 +156,6 @@ static void gf128mul_x_bbe(be128 *r, const be128 *x)
r->b = cpu_to_be64((b << 1) ^ _tt);
 }
 
-void gf128mul_x_ble(be128 *r, const be128 *x)
-{
-   u64 a = le64_to_cpu(x->a);
-   u64 b = le64_to_cpu(x->b);
-   u64 _tt = gf128mul_table_be[b >> 63];
-
-   r->a = cpu_to_le64((a << 1) ^ _tt);
-   r->b = cpu_to_le64((b << 1) | (a >> 63));
-}
-EXPORT_SYMBOL(gf128mul_x_ble);
-
 static void gf128mul_x8_lle(be128 *x)
 {
u64 a = be64_to_cpu(x->a);
diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h
index 0bc9b5f..46a01a2 100644
--- a/include/crypto/gf128mul.h
+++ b/include/crypto/gf128mul.h
@@ -49,6 +49,7 @@
 #ifndef _CRYPTO_GF128MUL_H
 #define _CRYPTO_GF128MUL_H
 
+#include 
 #include 
 #include 
 
@@ -163,8 +164,18 @@ void gf128mul_lle(be128 *a, const be128 *b);
 
 void gf128mul_bbe(be128 *a, const be128 *b);
 
-/* multiply by x in ble format, needed by XTS */
-void gf128mul_x_ble(be128 *a, const be128 *b);
+/* Multiply by x in ble format, needed by XTS.
+ * Defined here for performance. */
+static inline void gf128mul_x_ble(be128 *r, const be128 *x)
+{
+   u64 a = le64_to_cpu(x->a);
+   u64 b = le64_to_cpu(x->b);
+   /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */
+   u64 _tt = (b & ((u64)1 << 63)) ? 0x87 : 0x00;
+
+   r->a = cpu_to_le64((a << 1) ^ _tt);
+   r->b = cpu_to_le64((b << 1) | (a >> 63));
+}
 
 /* 4k table optimization */
 
-- 
2.9.3



dm-crypt IV generation (summary)

2017-03-10 Thread Ondrej Mosnacek
Hi all,

I was tasked to post a summary the whole dm-crypt IV generation
problem and all the suggested solutions along with their drawbacks, so
here it goes...

PROBLEM STATEMENT:
Currently, dm-crypt uses a fixed 512-byte sector size and handles
en-/decrypting of a bio by submitting a separate request to the crypto
API for each sector. This causes a performance problem with some
crypto drivers that have a high processing overhead for small
requests, most notably crypto accelerators [1][2], but also e.g. the
AES-NI driver [3].
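
To make the overhead concrete, the per-sector pattern is roughly the
following (a simplified, synchronous sketch, not actual dm-crypt code;
real dm-crypt walks bio scatterlists and completes requests
asynchronously):

/* one 512-byte skcipher request per sector, each with its own IV */
static int encrypt_sectors_one_by_one(struct crypto_skcipher *tfm,
				      struct scatterlist *sg_in,
				      struct scatterlist *sg_out,
				      sector_t first, unsigned int nr)
{
	SKCIPHER_REQUEST_ON_STACK(req, tfm);
	unsigned int i;
	int err = 0;

	skcipher_request_set_tfm(req, tfm);
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      NULL, NULL);

	for (i = 0; i < nr && !err; i++) {
		u8 iv[16] = { 0 };	/* plain64-style IV */

		*(__le64 *)iv = cpu_to_le64(first + i);
		/* request setup and driver entry costs are paid per
		 * 512 bytes of data */
		skcipher_request_set_crypt(req, &sg_in[i], &sg_out[i],
					   512, iv);
		err = crypto_skcipher_encrypt(req);
	}
	skcipher_request_zero(req);
	return err;
}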

POSSIBLE SOLUTIONS:
1. Move IV generator algorithms to crypto API and allow them to
process the whole bio at once. ([5] or something equivalent)
ISSUES:
a) The 'keycount' parameter.
In order to support multi-key modes from Loop-AES,
dm-crypt accepts a keycount parameter which, if it != 1, causes
consecutive sectors to be encrypted with a different key. This
parameter can be specified with any of the cipher modes, which makes
porting the full range of modes supported by dm-crypt to the crypto API
rather messy, since a lot of dm-crypt-specific code needs to be moved
into the crypto drivers.
b) New AEAD functionality; random IV generator.
The soon-to-be-added AEAD functionality in dm-crypt
introduces a new IV generator that generates IVs randomly and stores
them as sector metadata. This means IV generation cannot be handled
solely in the driver. Also, additional AEAD implementation of IV
generators would be eventually needed.

2. Restrict the keycount parameter to allow only reasonable
combinations and then move IV generators to crypto API.
In Loop-AES, the multi-key mode always uses exactly 64 keys and
there is no reasonable scenario where someone would like to use
keycount != 1 with other IV mode than LMK. Thus it might be acceptable
to only work with multiple keys in LMK (and only allow keycount == 64
for it) and work with just one key in all other modes (and only allow
keycount == 1 for them). I.e., the cipher format would remain the
same, just with more restricted values.

Note that this would be basically a breaking change (the keycount
parameter is documented [4], so there may be users somewhere that use
it in some unusual combination...). Still, such combinations would
have to be used explicitly, as they are never used with common
LUKS/Loop-AES/TrueCrypt partitions (something like cryptsetup --type
plain ... or dmsetup would have to be used directly).

Applying this change would allow implementing the IV generators as
simple crypto algorithms that honor the usual interface (without
setkey hacks and passing of special structures via IV).

ISSUES:
a) Backward incompatibility (see above).
b) Again need to somehow handle the new 'random' IV generator.

3. Extend crypto API to allow passing multiple requests at once (each
with a separate IV) to crypto drivers.
(see [3])
ISSUES:
a) HW accelerators would have to specifically support this
scheme (unless they are somehow programmable).
I.e. accelerators that already have the plain64 IV
generator hard-coded could not fully benefit from this (I don't know
how many are already out there). However, future ones could implement
this 'universal' IV mode and enjoy basically the same benefit.

4. Implement support of 4KB sectors (or other sizes, too) in dm-crypt.
This would partially help, since on devices with 4K sectors the
size of each crypto request would then be 8x bigger and overhead would
be reduced without the need to mess with the dm-crypt IV generators.

ISSUES:
a) Only 4KB would be processed at once, not the whole bio (but
it may suffice).
b) Partitions encrypted as 4KB sectors could not be safely
moved to a 512B-sector device (writes would not be atomic).

QUESTIONS:
@Mike/dm-devel: Do you think it would be feasible to apply solution 2
(thus introducing the small incompatibility)?

REFERENCES:
[1] 
https://nelenkov.blogspot.com/2015/05/hardware-accelerated-disk-encryption-in.html
[2] https://lkml.org/lkml/2017/3/2/274
[3] https://www.mail-archive.com/linux-crypto@vger.kernel.org/msg23007.html
[4] https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt
[5] https://lkml.org/lkml/2017/2/7/182

Cheers,
Ondrej


Re: [RFC PATCH v4] IV Generation algorithms for dm-crypt

2017-03-01 Thread Ondrej Mosnacek
2017-03-01 13:42 GMT+01:00 Gilad Ben-Yossef :
> It really is an observation about overhead of context switches between
> dm-crypt and
> whatever/wherever you handle crypto - be it an off CPU hardware engine
> or a bunch
> of parallel kernel threads running on other cores. You really want to
> burst as much as
> possible.

[...]

>> For XTS you need just simple linear IV. No problem with that, implementation
>> in crypto API and hw is trivial.
>>
>> But for compatible IV (that provides compatibility with loopAES and very old 
>> TrueCrypt),
>> these should be never ever implemented again anywhere.
>
>>
>> Specifically "tcw" is broken, insecure and provided here just to help people 
>> to migrate
>> from old Truecrypt containers. Even Truecrypt followers removed it from the 
>> codebase.
>> (It is basically combination of IV and slight modification of CBC mode. All
>> recent version switched to XTS and plain IV.)
>>
>> So building abstraction over something known to be broken and that is now 
>> intentionally
>> isolated inside dmcrypt is, in my opinion, really not a good idea.
>>
>
> I don't think anyone is interested in these modes. How do you support
> XTS and essiv in
> a generic way without supporting this broken modes is not something
> I'm clear on though.

Wouldn't adopting a bulk request API (something like what I tried to
do here [1]) that allows users to supply multiple messages, each with
their own IV, fulfill this purpose? That way, we wouldn't need to
introduce any new modes into Crypto API and the drivers/accelerators
would only need to provide bulk implementations of common modes
(xts(aes), cbc(aes), ...) to provide better performance for dm-crypt
(and possibly other users, too).

I'm not sure how exactly these crypto accelerators work, but wouldn't
it help if the drivers simply get more messages (in our case sectors)
in a single call? I wonder, would (efficiently) supporting such a
scheme require changes in the HW itself or could it be achieved just
by modifying driver code (let's say specifically for your CryptoCell
accelerator)?

Thanks,
Ondrej

[1] https://www.mail-archive.com/linux-crypto@vger.kernel.org/msg23007.html

>
> Thanks,
> Gilad
>
>
>
> --
> Gilad Ben-Yossef
> Chief Coffee Drinker
>
> "If you take a class in large-scale robotics, can you end up in a
> situation where the homework eats your dog?"
>  -- Jean-Baptiste Queru


[RFC PATCH 6/6] dm-crypt: Add bulk crypto processing support

2017-01-12 Thread Ondrej Mosnacek
This patch converts dm-crypt to use bulk requests when invoking skcipher
operations, allowing the crypto drivers to process multiple sectors at once,
while reducing the overhead caused by the small sector size.

The new code detects if multiple sectors from a bio are contiguously stored
within a single page (which should almost always be the case), and in such case
processes all these sectors via a single bulk request.

Note that the bio can also consist of several (likely consecutive) pages, which
could all be bundled in a single request. However, since we need to specify an
upper bound on how many sectors we are going to send at once (and this bound
may affect the amount of memory allocated per single request), it is best to
just limit the request bundling to a single page.
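
With the usual 4 KiB pages and 512-byte sectors, this per-page bound
works out to 1 << (PAGE_SHIFT - SECTOR_SHIFT) = 1 << (12 - 9) = 8
sectors per bulk request (see the MAX_CONSEC_SECTORS definition in the
diff below).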

Note that if the 'keycount' parameter of the cipher specification is set to a
value other than 1, dm-crypt still sends only one sector in each request, since
in such case the neighboring sectors are encrypted with different keys.

This change causes a detectable read/write speedup (about 5-10%) on a ramdisk
when AES-NI accelerated ciphers are used.

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 drivers/md/dm-crypt.c | 254 --
 1 file changed, 165 insertions(+), 89 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 7c6c572..d3f69e1 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -37,6 +37,9 @@
 
 #define DM_MSG_PREFIX "crypt"
 
+/* for now, we only bundle consecutive sectors within a single page */
+#define MAX_CONSEC_SECTORS (1 << (PAGE_SHIFT - SECTOR_SHIFT))
+
 /*
  * context holding the current state of a multi-part conversion
  */
@@ -48,7 +51,7 @@ struct convert_context {
struct bvec_iter iter_out;
sector_t cc_sector;
atomic_t cc_pending;
-   struct skcipher_request *req;
+   struct skcipher_bulk_request *req;
 };
 
 /*
@@ -73,6 +76,7 @@ struct dm_crypt_request {
struct scatterlist sg_in;
struct scatterlist sg_out;
sector_t iv_sector;
+   sector_t sector_count;
 };
 
 struct crypt_config;
@@ -83,9 +87,9 @@ struct crypt_iv_operations {
void (*dtr)(struct crypt_config *cc);
int (*init)(struct crypt_config *cc);
int (*wipe)(struct crypt_config *cc);
-   int (*generator)(struct crypt_config *cc, u8 *iv,
+   int (*generator)(struct crypt_config *cc, u8 *iv, unsigned int sector,
 struct dm_crypt_request *dmreq);
-   int (*post)(struct crypt_config *cc, u8 *iv,
+   int (*post)(struct crypt_config *cc, u8 *iv, unsigned int sector,
struct dm_crypt_request *dmreq);
 };
 
@@ -163,14 +167,14 @@ struct crypt_config {
/*
 * Layout of each crypto request:
 *
-*   struct skcipher_request
+*   struct skcipher_bulk_request
 *  context
 *  padding
 *   struct dm_crypt_request
 *  padding
-*   IV
+*   IVs
 *
-* The padding is added so that dm_crypt_request and the IV are
+* The padding is added so that dm_crypt_request and the IVs are
 * correctly aligned.
 */
unsigned int dmreq_start;
@@ -245,20 +249,24 @@ static struct crypto_skcipher *any_tfm(struct 
crypt_config *cc)
  * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
  */
 
-static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
+static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *ivs, unsigned int i,
  struct dm_crypt_request *dmreq)
 {
+   u8 *iv = ivs + i * cc->iv_size;
+
memset(iv, 0, cc->iv_size);
-   *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
+   *(__le32 *)iv = cpu_to_le32((dmreq->iv_sector + i) & 0xffffffff);
 
return 0;
 }
 
-static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
-   struct dm_crypt_request *dmreq)
+static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *ivs,
+   unsigned int i, struct dm_crypt_request *dmreq)
 {
+   u8 *iv = ivs + i * cc->iv_size;
+
memset(iv, 0, cc->iv_size);
-   *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
+   *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector + i);
 
return 0;
 }
@@ -410,13 +418,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, 
struct dm_target *ti,
return err;
 }
 
-static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
+static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *ivs, unsigned int i,
  struct dm_crypt_request *dmreq)
 {
struct crypto_cipher *essiv_tfm = cc->iv_private;
+   u8 *iv = ivs + i * cc->iv_size;
 
memset(iv, 0, cc->iv_size);
-   *(__le64 *)iv = cpu_to_le64(dmreq->iv

[RFC PATCH 1/6] crypto: skcipher - Add bulk request processing API

2017-01-12 Thread Ondrej Mosnacek
This patch adds bulk request processing to the skcipher interface.
Specifically, it adds a new type of request ('skcipher_bulk_request'), which
allows passing multiple independent messages to the skcipher driver.

The buffers for the message data are passed via just two sg lists (one for src
buffer, one for dst buffer). The IVs are passed via a single buffer, where they
are stored sequentially. The interface allows specifying either a fixed length
for all messages or a pointer to an array of message lengths.

A skcipher implementation that wants to provide support for bulk requests may
set the appropriate fields of its skcipher_alg struct. If these fields are not
provided (or the skcipher is created from an (a)blkcipher), the crypto API
automatically sets these fields to a fallback implementation, which just splits
the bulk request into a series of regular skcipher requests on the same tfm.

This means that the new type of request can be used with all skciphers, even if
they do not support bulk requests natively.

Note that when allocating a skcipher_bulk_request, the user must specify the
maximum number of messages that they are going to submit via the request. This
is necessary for the fallback implementation, which has to allocate space for
the appropriate number of subrequests so that they can be processed in
parallel. If the skcipher is synchronous, then the fallback implementation
only allocates space for one subrequest and processes the partial requests
sequentially.
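
A rough caller-side sketch, with names and argument order taken from the
cryptd and dm-crypt conversions elsewhere in this series, and assuming
(as the description above suggests) that passing a NULL per-message size
array means the fixed message size applies to every message:

/* encrypt eight 512-byte messages in one call; 'ivs' holds the eight
 * IVs stored back to back */
static int encrypt_eight_sectors(struct crypto_skcipher *tfm,
				 struct scatterlist *src,
				 struct scatterlist *dst, u8 *ivs)
{
	SKCIPHER_BULK_REQUEST_ON_STACK(req, 8, tfm);
	int err;

	skcipher_bulk_request_set_tfm(req, tfm);
	skcipher_bulk_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
					   NULL, NULL);
	skcipher_bulk_request_set_crypt(req, src, dst, 8, 512, NULL, ivs);

	err = crypto_skcipher_encrypt_bulk(req);
	skcipher_bulk_request_zero(req);
	return err;
}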

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 crypto/Makefile|   1 +
 crypto/skcipher.c  |  15 ++
 crypto/skcipher_bulk.c | 312 +
 include/crypto/internal/skcipher.h |  32 
 include/crypto/skcipher.h  | 299 ++-
 5 files changed, 658 insertions(+), 1 deletion(-)
 create mode 100644 crypto/skcipher_bulk.c

diff --git a/crypto/Makefile b/crypto/Makefile
index b8f0e3e..cd1cf57 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_CRYPTO_AEAD2) += aead.o
 crypto_blkcipher-y := ablkcipher.o
 crypto_blkcipher-y += blkcipher.o
 crypto_blkcipher-y += skcipher.o
+crypto_blkcipher-y += skcipher_bulk.o
 obj-$(CONFIG_CRYPTO_BLKCIPHER2) += crypto_blkcipher.o
 obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o
 obj-$(CONFIG_CRYPTO_ECHAINIV) += echainiv.o
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 6ee6a15..8b6d684 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -667,6 +667,8 @@ static int crypto_init_skcipher_ops_blkcipher(struct 
crypto_tfm *tfm)
skcipher->ivsize = crypto_blkcipher_ivsize(blkcipher);
skcipher->keysize = calg->cra_blkcipher.max_keysize;
 
+   crypto_skcipher_bulk_set_fallback(skcipher);
+
return 0;
 }
 
@@ -760,6 +762,8 @@ static int crypto_init_skcipher_ops_ablkcipher(struct 
crypto_tfm *tfm)
sizeof(struct ablkcipher_request);
skcipher->keysize = calg->cra_ablkcipher.max_keysize;
 
+   crypto_skcipher_bulk_set_fallback(skcipher);
+
return 0;
 }
 
@@ -789,6 +793,14 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm)
skcipher->ivsize = alg->ivsize;
skcipher->keysize = alg->max_keysize;
 
+   if (!alg->encrypt_bulk || !alg->decrypt_bulk || !alg->reqsize_bulk)
+   crypto_skcipher_bulk_set_fallback(skcipher);
+   else {
+   skcipher->encrypt_bulk = alg->encrypt_bulk;
+   skcipher->decrypt_bulk = alg->decrypt_bulk;
+   skcipher->reqsize_bulk = alg->reqsize_bulk;
+   }
+
if (alg->exit)
skcipher->base.exit = crypto_skcipher_exit_tfm;
 
@@ -822,6 +834,9 @@ static void crypto_skcipher_show(struct seq_file *m, struct 
crypto_alg *alg)
seq_printf(m, "ivsize   : %u\n", skcipher->ivsize);
seq_printf(m, "chunksize: %u\n", skcipher->chunksize);
seq_printf(m, "walksize : %u\n", skcipher->walksize);
+   seq_printf(m, "bulk : %s\n",
+  (skcipher->encrypt_bulk && skcipher->decrypt_bulk &&
+   skcipher->reqsize_bulk) ?  "yes" : "no");
 }
 
 #ifdef CONFIG_NET
diff --git a/crypto/skcipher_bulk.c b/crypto/skcipher_bulk.c
new file mode 100644
index 000..9630122
--- /dev/null
+++ b/crypto/skcipher_bulk.c
@@ -0,0 +1,312 @@
+/*
+ * Bulk IV fallback for skcipher.
+ *
+ * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2016-2017 Ondrej Mosnacek <omosna...@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ 

[RFC PATCH 4/6] crypto: simd - Add bulk request support

2017-01-12 Thread Ondrej Mosnacek
This patch adds proper support for the new bulk requests to the SIMD helpers.

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 crypto/simd.c | 61 +++
 1 file changed, 61 insertions(+)

diff --git a/crypto/simd.c b/crypto/simd.c
index 8820337..2ae5930 100644
--- a/crypto/simd.c
+++ b/crypto/simd.c
@@ -100,6 +100,64 @@ static int simd_skcipher_decrypt(struct skcipher_request 
*req)
return crypto_skcipher_decrypt(subreq);
 }
 
+static int simd_skcipher_encrypt_bulk(struct skcipher_bulk_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct skcipher_bulk_request *subreq;
+   struct crypto_skcipher *child;
+
+   subreq = skcipher_bulk_request_ctx(req);
+   *subreq = *req;
+
+   if (!may_use_simd() ||
+   (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm)))
+   child = &ctx->cryptd_tfm->base;
+   else
+   child = cryptd_skcipher_child(ctx->cryptd_tfm);
+
+   skcipher_bulk_request_set_tfm(subreq, child);
+
+   return crypto_skcipher_encrypt_bulk(subreq);
+}
+
+static int simd_skcipher_decrypt_bulk(struct skcipher_bulk_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct skcipher_bulk_request *subreq;
+   struct crypto_skcipher *child;
+
+   subreq = skcipher_bulk_request_ctx(req);
+   *subreq = *req;
+
+   if (!may_use_simd() ||
+   (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm)))
+   child = &ctx->cryptd_tfm->base;
+   else
+   child = cryptd_skcipher_child(ctx->cryptd_tfm);
+
+   skcipher_bulk_request_set_tfm(subreq, child);
+
+   return crypto_skcipher_decrypt_bulk(subreq);
+}
+
+static unsigned int simd_skcipher_reqsize_bulk(struct crypto_skcipher *tfm,
+  unsigned int maxmsgs)
+{
+   struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct crypto_skcipher *tfm_cryptd, *tfm_child;
+   unsigned int reqsize_cryptd, reqsize_child;
+
+   tfm_cryptd = &ctx->cryptd_tfm->base;
+   tfm_child = cryptd_skcipher_child(ctx->cryptd_tfm);
+
+   reqsize_cryptd = crypto_skcipher_bulk_reqsize(tfm_cryptd, maxmsgs);
+   reqsize_child = crypto_skcipher_bulk_reqsize(tfm_child, maxmsgs);
+   return sizeof(struct skcipher_bulk_request) +
+   max(reqsize_cryptd, reqsize_child);
+}
+
 static void simd_skcipher_exit(struct crypto_skcipher *tfm)
 {
struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -187,6 +245,9 @@ struct simd_skcipher_alg *simd_skcipher_create_compat(const 
char *algname,
alg->setkey = simd_skcipher_setkey;
alg->encrypt = simd_skcipher_encrypt;
alg->decrypt = simd_skcipher_decrypt;
+   alg->encrypt_bulk = simd_skcipher_encrypt_bulk;
+   alg->decrypt_bulk = simd_skcipher_decrypt_bulk;
+   alg->reqsize_bulk = simd_skcipher_reqsize_bulk;
 
err = crypto_register_skcipher(alg);
if (err)
-- 
2.9.3



[RFC PATCH 3/6] crypto: cryptd - Add skcipher bulk request support

2017-01-12 Thread Ondrej Mosnacek
This patch adds proper support for the new bulk requests to cryptd.

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 crypto/cryptd.c | 111 
 1 file changed, 111 insertions(+)

diff --git a/crypto/cryptd.c b/crypto/cryptd.c
index 0508c48..b7d6e13 100644
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -555,6 +555,114 @@ static int cryptd_skcipher_decrypt_enqueue(struct 
skcipher_request *req)
return cryptd_skcipher_enqueue(req, cryptd_skcipher_decrypt);
 }
 
+static void cryptd_skcipher_bulk_complete(struct skcipher_bulk_request *req,
+ int err)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct cryptd_skcipher_request_ctx *rctx =
+   skcipher_bulk_request_ctx(req);
+   int refcnt = atomic_read(&ctx->refcnt);
+
+   local_bh_disable();
+   rctx->complete(&req->base, err);
+   local_bh_enable();
+
+   if (err != -EINPROGRESS && refcnt && atomic_dec_and_test(&ctx->refcnt))
+   crypto_free_skcipher(tfm);
+}
+
+static void cryptd_skcipher_bulk_encrypt(struct crypto_async_request *base,
+int err)
+{
+   struct skcipher_bulk_request *req = skcipher_bulk_request_cast(base);
+   struct cryptd_skcipher_request_ctx *rctx =
+   skcipher_bulk_request_ctx(req);
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct crypto_skcipher *child = ctx->child;
+   SKCIPHER_BULK_REQUEST_ON_STACK(subreq, req->maxmsgs, child);
+
+   if (unlikely(err == -EINPROGRESS))
+   goto out;
+
+   skcipher_bulk_request_set_tfm(subreq, child);
+   skcipher_bulk_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP,
+  NULL, NULL);
+   skcipher_bulk_request_set_crypt(subreq, req->src, req->dst, req->nmsgs,
+   req->msgsize, req->msgsizes, req->ivs);
+
+   err = crypto_skcipher_encrypt_bulk(subreq);
+   skcipher_bulk_request_zero(subreq);
+
+   req->base.complete = rctx->complete;
+
+out:
+   cryptd_skcipher_bulk_complete(req, err);
+}
+
+static void cryptd_skcipher_bulk_decrypt(struct crypto_async_request *base,
+int err)
+{
+   struct skcipher_bulk_request *req = skcipher_bulk_request_cast(base);
+   struct cryptd_skcipher_request_ctx *rctx =
+   skcipher_bulk_request_ctx(req);
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct crypto_skcipher *child = ctx->child;
+   SKCIPHER_BULK_REQUEST_ON_STACK(subreq, req->maxmsgs, child);
+
+   if (unlikely(err == -EINPROGRESS))
+   goto out;
+
+   skcipher_bulk_request_set_tfm(subreq, child);
+   skcipher_bulk_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP,
+  NULL, NULL);
+   skcipher_bulk_request_set_crypt(subreq, req->src, req->dst, req->nmsgs,
+   req->msgsize, req->msgsizes, req->ivs);
+
+   err = crypto_skcipher_decrypt_bulk(subreq);
+   skcipher_bulk_request_zero(subreq);
+
+   req->base.complete = rctx->complete;
+
+out:
+   cryptd_skcipher_bulk_complete(req, err);
+}
+
+static int cryptd_skcipher_bulk_enqueue(struct skcipher_bulk_request *req,
+   crypto_completion_t compl)
+{
+   struct cryptd_skcipher_request_ctx *rctx =
+   skcipher_bulk_request_ctx(req);
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct cryptd_queue *queue;
+
+   queue = cryptd_get_queue(crypto_skcipher_tfm(tfm));
+   rctx->complete = req->base.complete;
+   req->base.complete = compl;
+
+   return cryptd_enqueue_request(queue, &req->base);
+}
+
+static int cryptd_skcipher_bulk_encrypt_enqueue(
+   struct skcipher_bulk_request *req)
+{
+   return cryptd_skcipher_bulk_enqueue(req, cryptd_skcipher_bulk_encrypt);
+}
+
+static int cryptd_skcipher_bulk_decrypt_enqueue(
+   struct skcipher_bulk_request *req)
+{
+   return cryptd_skcipher_bulk_enqueue(req, cryptd_skcipher_bulk_decrypt);
+}
+
+static unsigned int cryptd_skcipher_bulk_reqsize(struct crypto_skcipher *tfm,
+unsigned int maxmsgs)
+{
+   return sizeof(struct cryptd_skcipher_request_ctx);
+}
+
 static int cryptd_skcipher_init_tfm(struct crypto_skcipher *tfm)
 {
struct skcipher_instance *inst = skcipher_alg_instance(tfm);
@@ 

[RFC PATCH 0/6] Add bulk skcipher requests to crypto API and dm-crypt

2017-01-12 Thread Ondrej Mosnacek
Hi,

the goal of this patchset is to allow those skcipher API users that need to
process batches of small messages (especially dm-crypt) to do so efficiently.

The first patch introduces a new request type (and corresponding encrypt/decrypt
functions) to the skcipher API. The new API can be used to submit multiple
messages at once, allowing drivers to reduce overhead compared to
processing each message separately.

The skcipher drivers can provide support for the new request type by setting the
corresponding fields of their skcipher_alg structure. If 'native' support is not
provided by a driver (i.e. the fields are left NULL), the crypto API
transparently provides a generic fallback implementation, which simply processes
the bulk request as a set of standard requests on the same tfm.
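
For a driver that does have a native implementation, opting in amounts
to filling the new skcipher_alg fields (a sketch; every mydrv_* name is
a placeholder and the usual cra_* boilerplate is omitted):

static struct skcipher_alg mydrv_cbc_aes_alg = {
	.base.cra_name		= "cbc(aes)",
	.base.cra_driver_name	= "cbc-aes-mydrv",
	.base.cra_blocksize	= AES_BLOCK_SIZE,
	.min_keysize		= AES_MIN_KEY_SIZE,
	.max_keysize		= AES_MAX_KEY_SIZE,
	.ivsize			= AES_BLOCK_SIZE,
	.setkey			= mydrv_setkey,
	.encrypt		= mydrv_encrypt,
	.decrypt		= mydrv_decrypt,
	/* native bulk hooks; leaving these NULL selects the generic
	 * fallback described above */
	.encrypt_bulk		= mydrv_encrypt_bulk,
	.decrypt_bulk		= mydrv_decrypt_bulk,
	.reqsize_bulk		= mydrv_reqsize_bulk,
};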

The second patch extends skcipher_walk so it can be used for processing the new
bulk requests, while preserving equivalent functionality when used with standard
requests.

The third and fourth patches add native bulk request support to the cryptd and
SIMD helper wrappers, respectively.

The fifth patch adds bulk request support to the AES-NI skcipher drivers, in
order to provide an example for both implementing the bulk request processing
and the usage of the extended skcipher_walk in such implementation. Also, this
patch provides a slight optimization, since the kernel_fpu_* functions are
called just once for the whole bulk request. Note that both the standard and
bulk implementation mostly use the same code under the hood.

The last patch converts dm-crypt to use bulk requests and makes it submit
multiple sectors at once, whenever they are stored sequentially within a single
page.

With all the patches applied, I was able to measure a small speedup (~5-10%)
with AES-NI ciphers and a dm-crypt device mapped over a ramdisk.

To-be-done:
testing the bulk API in testmgr.c
documentation update

Ondrej Mosnacek (6):
  crypto: skcipher - Add bulk request processing API
  crypto: skcipher - Add bulk request support to walk
  crypto: cryptd - Add skcipher bulk request support
  crypto: simd - Add bulk request support
  crypto: aesni-intel - Add bulk request support
  dm-crypt: Add bulk crypto processing support

 arch/x86/crypto/aesni-intel_glue.c| 267 +++--
 arch/x86/crypto/glue_helper.c |  23 +--
 arch/x86/include/asm/crypto/glue_helper.h |   2 +-
 crypto/Makefile   |   1 +
 crypto/cryptd.c   | 111 +++
 crypto/simd.c |  61 ++
 crypto/skcipher.c | 207 +++-
 crypto/skcipher_bulk.c| 312 ++
 drivers/md/dm-crypt.c | 254 +++-
 include/crypto/internal/skcipher.h|  42 +++-
 include/crypto/skcipher.h | 299 +++-
 11 files changed, 1369 insertions(+), 210 deletions(-)
 create mode 100644 crypto/skcipher_bulk.c

-- 
2.9.3



[RFC PATCH 5/6] crypto: aesni-intel - Add bulk request support

2017-01-12 Thread Ondrej Mosnacek
This patch implements bulk request handling in the AES-NI crypto drivers.
The major advantage of this is that with bulk requests, the kernel_fpu_*
functions (which are usually quite slow) are now called only once for the whole
request.

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 arch/x86/crypto/aesni-intel_glue.c| 267 +++---
 arch/x86/crypto/glue_helper.c |  23 ++-
 arch/x86/include/asm/crypto/glue_helper.h |   2 +-
 3 files changed, 221 insertions(+), 71 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_glue.c 
b/arch/x86/crypto/aesni-intel_glue.c
index 36ca150..5f67afc 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -364,70 +364,116 @@ static int aesni_skcipher_setkey(struct crypto_skcipher 
*tfm, const u8 *key,
  crypto_skcipher_ctx(tfm), key, len);
 }
 
-static int ecb_encrypt(struct skcipher_request *req)
+typedef void (*aesni_crypt_t)(struct crypto_aes_ctx *ctx,
+ u8 *out, const u8 *in, unsigned int len);
+
+typedef void (*aesni_ivcrypt_t)(struct crypto_aes_ctx *ctx,
+   u8 *out, const u8 *in, unsigned int len,
+   u8 *iv);
+
+static int ecb_crypt(struct crypto_aes_ctx *ctx, struct skcipher_walk *walk,
+aesni_crypt_t crypt)
 {
-   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-   struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
-   struct skcipher_walk walk;
unsigned int nbytes;
int err;
 
-   err = skcipher_walk_virt(&walk, req, true);
-
kernel_fpu_begin();
-   while ((nbytes = walk.nbytes)) {
-   aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK);
+   while ((nbytes = walk->nbytes)) {
+   crypt(ctx, walk->dst.virt.addr, walk->src.virt.addr,
+ nbytes & AES_BLOCK_MASK);
nbytes &= AES_BLOCK_SIZE - 1;
-   err = skcipher_walk_done(&walk, nbytes);
+   err = skcipher_walk_done(walk, nbytes);
}
kernel_fpu_end();
 
return err;
 }
 
-static int ecb_decrypt(struct skcipher_request *req)
+static int cbc_crypt(struct crypto_aes_ctx *ctx, struct skcipher_walk *walk,
+aesni_ivcrypt_t crypt)
 {
-   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-   struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
-   struct skcipher_walk walk;
unsigned int nbytes;
int err;
 
-   err = skcipher_walk_virt(&walk, req, true);
-
kernel_fpu_begin();
-   while ((nbytes = walk.nbytes)) {
-   aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK);
+   while ((nbytes = walk->nbytes)) {
+   crypt(ctx, walk->dst.virt.addr, walk->src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk->iv);
nbytes &= AES_BLOCK_SIZE - 1;
-   err = skcipher_walk_done(&walk, nbytes);
+   err = skcipher_walk_done(walk, nbytes);
}
kernel_fpu_end();
 
return err;
 }
 
-static int cbc_encrypt(struct skcipher_request *req)
+static int ecb_encrypt(struct skcipher_request *req)
 {
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
struct skcipher_walk walk;
-   unsigned int nbytes;
int err;
 
err = skcipher_walk_virt(&walk, req, true);
+   if (err)
+   return err;
 
-   kernel_fpu_begin();
-   while ((nbytes = walk.nbytes)) {
-   aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK, walk.iv);
-   nbytes &= AES_BLOCK_SIZE - 1;
-   err = skcipher_walk_done(&walk, nbytes);
-   }
-   kernel_fpu_end();
+   return ecb_crypt(ctx, &walk, aesni_ecb_enc);
+}
 
-   return err;
+static int ecb_decrypt(struct skcipher_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+   struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+   struct skcipher_walk walk;
+   int err;
+
+   err = skcipher_walk_virt(&walk, req, true);
+   if (err)
+   return err;
+
+   return ecb_crypt(ctx, &walk, aesni_ecb_dec);
+}
+
+static int ecb_encrypt_bulk(struct skcipher_bulk_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+   struct skcipher_walk walk;
+   int err;
+
+   err = skcipher_walk_virt_bulk(&walk, req, true);
+   if (err)
+   return err;
+
+   return ecb_crypt(ctx, &walk, aesni_ecb_enc);
+}
+
+st

[RFC PATCH 2/6] crypto: skcipher - Add bulk request support to walk

2017-01-12 Thread Ondrej Mosnacek
This patch tweaks skcipher_walk so it can be used with the new bulk requests.

The new skcipher_walk can be initialized either from a skcipher_request (in
which case its behavior is equivalent to the old code) or from a
skcipher_bulk_request, in which case the usage is almost identical, the most
significant exception being that skciphers which somehow tweak the IV
(e.g. XTS) must check the new nextmsg flag before processing each chunk and
re-tweak the IV if it is set. For other ciphers skcipher_walk automatically
switches to the next IV at message boundaries.
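
For such a cipher the walk loop therefore ends up looking roughly like
this (a sketch; compute_tweak() and process_blocks() stand in for the
cipher's own helpers and bsize for its block size):

while (walk.nbytes) {
	/* a new message started: re-derive the tweak from that
	 * message's IV before processing any of its blocks */
	if (walk.nextmsg)
		compute_tweak(ctx, &tweak, walk.iv);

	process_blocks(ctx, walk.dst.virt.addr, walk.src.virt.addr,
		       walk.nbytes & ~(bsize - 1), &tweak);

	err = skcipher_walk_done(&walk, walk.nbytes & (bsize - 1));
}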

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 crypto/skcipher.c  | 192 +++--
 include/crypto/internal/skcipher.h |  10 +-
 2 files changed, 153 insertions(+), 49 deletions(-)

diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 8b6d684..b810e90 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -33,6 +33,7 @@ enum {
SKCIPHER_WALK_COPY = 1 << 2,
SKCIPHER_WALK_DIFF = 1 << 3,
SKCIPHER_WALK_SLEEP = 1 << 4,
+   SKCIPHER_WALK_HETEROGENOUS = 1 << 5,
 };
 
 struct skcipher_walk_buffer {
@@ -94,6 +95,41 @@ static inline u8 *skcipher_get_spot(u8 *start, unsigned int 
len)
return max(start, end_page);
 }
 
+static int skcipher_copy_iv(struct skcipher_walk *walk)
+{
+   unsigned a = crypto_tfm_ctx_alignment() - 1;
+   unsigned alignmask = walk->alignmask;
+   unsigned ivsize = walk->ivsize;
+   unsigned bs = walk->stride;
+   unsigned aligned_bs;
+   unsigned size;
+   u8 *iv;
+
+   aligned_bs = ALIGN(bs, alignmask);
+
+   /* Minimum size to align buffer by alignmask. */
+   size = alignmask & ~a;
+
+   if (walk->flags & SKCIPHER_WALK_PHYS)
+   size += ivsize;
+   else {
+   size += aligned_bs + ivsize;
+
+   /* Minimum size to ensure buffer does not straddle a page. */
+   size += (bs - 1) & ~(alignmask | a);
+   }
+
+   walk->buffer = kmalloc(size, skcipher_walk_gfp(walk));
+   if (!walk->buffer)
+   return -ENOMEM;
+
+   iv = PTR_ALIGN(walk->buffer, alignmask + 1);
+   iv = skcipher_get_spot(iv, bs) + aligned_bs;
+
+   walk->iv = memcpy(iv, walk->iv, walk->ivsize);
+   return 0;
+}
+
 static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
 {
u8 *addr;
@@ -108,9 +144,12 @@ static int skcipher_done_slow(struct skcipher_walk *walk, 
unsigned int bsize)
 int skcipher_walk_done(struct skcipher_walk *walk, int err)
 {
unsigned int n = walk->nbytes - err;
-   unsigned int nbytes;
+   unsigned int nbytes, nbytes_msg;
+
+   walk->nextmsg = false; /* reset the nextmsg flag */
 
nbytes = walk->total - n;
+   nbytes_msg = walk->total_msg - n;
 
if (unlikely(err < 0)) {
nbytes = 0;
@@ -139,8 +178,31 @@ int skcipher_walk_done(struct skcipher_walk *walk, int err)
if (err > 0)
err = 0;
 
+   if (nbytes && !nbytes_msg) {
+   walk->nextmsg = true;
+
+   /* write the output IV: */
+   if (walk->iv != walk->oiv)
+   memcpy(walk->oiv, walk->iv, walk->ivsize);
+
+   /* advance to the IV of next message: */
+   walk->oiv += walk->ivsize;
+   walk->iv = walk->oiv;
+
+   if (unlikely(((unsigned long)walk->iv & walk->alignmask))) {
+   err = skcipher_copy_iv(walk);
+   if (err)
+   return err;
+   }
+
+   nbytes_msg = *walk->nextmsgsize;
+   if (walk->flags & SKCIPHER_WALK_HETEROGENOUS)
+   ++walk->nextmsgsize;
+   }
+
+   walk->nbytes = nbytes_msg;
+   walk->total_msg = nbytes_msg;
walk->total = nbytes;
-   walk->nbytes = nbytes;
 
scatterwalk_advance(&walk->in, n);
scatterwalk_advance(&walk->out, n);
@@ -343,13 +405,13 @@ static int skcipher_walk_next(struct skcipher_walk *walk)
walk->flags &= ~(SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY |
 SKCIPHER_WALK_DIFF);
 
-   n = walk->total;
+   n = walk->total_msg;
bsize = min(walk->stride, max(n, walk->blocksize));
n = scatterwalk_clamp(&walk->in, n);
n = scatterwalk_clamp(&walk->out, n);
 
if (unlikely(n < bsize)) {
-   if (unlikely(walk->total < walk->blocksize))
+   if (unlikely(walk->total_msg < walk->blocksize))
return skcipher_walk_done(walk, -EINVAL);
 
 slow_path:
@@ -388,41 +450,6 @@ static int skcipher_walk_next(struct skcipher_walk *walk)
 }
 EXPORT_SYMBOL_GPL(skcipher_walk_next);
 
-static int skciph

[PATCH v2] crypto: gcm - Fix IV buffer size in crypto_gcm_setkey

2016-09-23 Thread Ondrej Mosnacek
The cipher block size for GCM is 16 bytes, and thus the CTR transform
used in crypto_gcm_setkey() will also expect a 16-byte IV. However,
the code currently reserves only 8 bytes for the IV, causing
an out-of-bounds access in the CTR transform. This patch fixes
the issue by setting the size of the IV buffer to 16 bytes.

Fixes: 84c911523020 ("[CRYPTO] gcm: Add support for async ciphers")
Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
(Sorry for previous broken patch, hopefully I got it right this
time.)

I randomly noticed this while going over igcm.c for an unrelated
reason. It seems the wrong buffer size never caused any noticeable
problems (it's been there since 2007), but it should be corrected
nonetheless...

 crypto/gcm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/gcm.c b/crypto/gcm.c
index 70a892e8..f624ac9 100644
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -117,7 +117,7 @@ static int crypto_gcm_setkey(struct crypto_aead *aead, 
const u8 *key,
struct crypto_skcipher *ctr = ctx->ctr;
struct {
be128 hash;
-   u8 iv[8];
+   u8 iv[16];
 
struct crypto_gcm_setkey_result result;
 
--
2.7.4
