[PATCH] crypto: arm/speck - fix building in Thumb2 mode

2018-06-18 Thread Eric Biggers
Building the kernel with CONFIG_THUMB2_KERNEL=y and
CONFIG_CRYPTO_SPECK_NEON set fails with the following errors:

arch/arm/crypto/speck-neon-core.S: Assembler messages:

arch/arm/crypto/speck-neon-core.S:419: Error: r13 not allowed here -- `bic 
sp,#0xf'
arch/arm/crypto/speck-neon-core.S:423: Error: r13 not allowed here -- `bic 
sp,#0xf'
arch/arm/crypto/speck-neon-core.S:427: Error: r13 not allowed here -- `bic 
sp,#0xf'
arch/arm/crypto/speck-neon-core.S:431: Error: r13 not allowed here -- `bic 
sp,#0xf'

The problem is that the 'bic' instruction can't operate on the 'sp'
register in Thumb2 mode.  Fix it by using a temporary register.  This
isn't in the main loop, so the performance difference is negligible.
This also matches what aes-neonbs-core.S does.

Reported-by: Stefan Agner 
Fixes: ede9622162fa ("crypto: arm/speck - add NEON-accelerated implementation 
of Speck-XTS")
Signed-off-by: Eric Biggers 
---
 arch/arm/crypto/speck-neon-core.S | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm/crypto/speck-neon-core.S 
b/arch/arm/crypto/speck-neon-core.S
index 3c1e203e53b9..57caa742016e 100644
--- a/arch/arm/crypto/speck-neon-core.S
+++ b/arch/arm/crypto/speck-neon-core.S
@@ -272,9 +272,11 @@
 * Allocate stack space to store 128 bytes worth of tweaks.  For
 * performance, this space is aligned to a 16-byte boundary so that we
 * can use the load/store instructions that declare 16-byte alignment.
+* For Thumb2 compatibility, don't do the 'bic' directly on 'sp'.
 */
-   sub sp, #128
-   bic sp, #0xf
+   sub r12, sp, #128
+   bic r12, #0xf
+   mov sp, r12
 
 .if \n == 64
// Load first tweak
-- 
2.18.0.rc1.244.gcf134e6275-goog



Re: [PATCH v3 3/5] crypto: arm/speck - add NEON-accelerated implementation of Speck-XTS

2018-06-18 Thread Ard Biesheuvel
On 18 June 2018 at 23:56, Eric Biggers  wrote:
> On Sun, Jun 17, 2018 at 01:10:41PM +0200, Ard Biesheuvel wrote:
>> > +
>> > + // One-time XTS preparation
>> > +
>> > + /*
>> > +  * Allocate stack space to store 128 bytes worth of tweaks.  For
>> > +  * performance, this space is aligned to a 16-byte boundary so 
>> > that we
>> > +  * can use the load/store instructions that declare 16-byte 
>> > alignment.
>> > +  */
>> > + sub sp, #128
>> > + bic sp, #0xf
>> 
>> 
>>  This fails here when building with CONFIG_THUMB2_KERNEL=y
>> 
>>    AS  arch/arm/crypto/speck-neon-core.o
>> 
>>  arch/arm/crypto/speck-neon-core.S: Assembler messages:
>> 
>>  arch/arm/crypto/speck-neon-core.S:419: Error: r13 not allowed here --
>>  `bic sp,#0xf'
>>  arch/arm/crypto/speck-neon-core.S:423: Error: r13 not allowed here --
>>  `bic sp,#0xf'
>>  arch/arm/crypto/speck-neon-core.S:427: Error: r13 not allowed here --
>>  `bic sp,#0xf'
>>  arch/arm/crypto/speck-neon-core.S:431: Error: r13 not allowed here --
>>  `bic sp,#0xf'
>> 
>>  In a quick hack this change seems to address it:
>> 
>> 
>>  -   sub sp, #128
>>  -   bic sp, #0xf
>>  +   mov r6, sp
>>  +   sub r6, #128
>>  +   bic r6, #0xf
>>  +   mov sp, r6
>> 
>>  But there is probably a better solution to address this.
>> 
>> >>>
>> >>> Given that there is no NEON on M class cores, I recommend we put 
>> >>> something like
>> >>>
>> >>> THUMB(bx pc)
>> >>> THUMB(nop.w)
>> >>> THUMB(.arm)
>> >>>
>> >>> at the beginning and be done with it.
>> >>
>> >> I mean nop.n or just nop, of course, and we may need a '.align 2' at
>> >> the beginning as well.
>> >
>> > Wouldn't it be preferable to have it assemble it in Thumb2 too? It seems
>> > that bic sp,#0xf is the only issue...
>> >
>>
>> Well, in general, yes. In the case of NEON code, not really, since the
>> resulting code will not be smaller anyway, because the Thumb2 NEON
>> opcodes are all 4 bytes. Also, Thumb2-only cores don't have NEON
>> units, so all cores that this code can run on will be able to run in
>> ARM mode.
>>
>> So from a maintainability pov, having code that only assembles in one
>> way is better than having code that must compile both to ARM and to
>> Thumb2 opcodes.
>>
>> Just my 2 cents, anyway.
>
> I don't have too much of a preference, though Stefan's suggested 4 
> instructions
> can be reduced to 3, which also matches what aes-neonbs-core.S does:
>
> sub r12, sp, #128
> bic r12, #0xf
> mov sp, r12
>
> Ard, is the following what you're suggesting instead?
>

Yes, but after looking at the actual code, I prefer the change above.
The access occurs only once, not in the loop so the additional
instructions should not affect performance.

Apologies for the noise.

> diff --git a/arch/arm/crypto/speck-neon-core.S 
> b/arch/arm/crypto/speck-neon-core.S
> index 3c1e203e53b9..c989ce3dc057 100644
> --- a/arch/arm/crypto/speck-neon-core.S
> +++ b/arch/arm/crypto/speck-neon-core.S
> @@ -8,6 +8,7 @@
>   */
>
>  #include 
> +#include 
>
> .text
> .fpuneon
> @@ -233,6 +234,12 @@
>   * nonzero multiple of 128.
>   */
>  .macro _speck_xts_cryptn, decrypting
> +
> +   .align  2
> +   THUMB(bx pc)
> +   THUMB(nop)
> +   THUMB(.arm)
> +
> push{r4-r7}
> mov r7, sp
>
> @@ -413,6 +420,8 @@
> mov sp, r7
> pop {r4-r7}
> bx  lr
> +
> +   THUMB(.thumb)
>  .endm
>
>  ENTRY(speck128_xts_encrypt_neon)


Re: [PATCH v3 3/5] crypto: arm/speck - add NEON-accelerated implementation of Speck-XTS

2018-06-18 Thread Eric Biggers
On Sun, Jun 17, 2018 at 01:10:41PM +0200, Ard Biesheuvel wrote:
> > +
> > + // One-time XTS preparation
> > +
> > + /*
> > +  * Allocate stack space to store 128 bytes worth of tweaks.  For
> > +  * performance, this space is aligned to a 16-byte boundary so 
> > that we
> > +  * can use the load/store instructions that declare 16-byte 
> > alignment.
> > +  */
> > + sub sp, #128
> > + bic sp, #0xf
> 
> 
>  This fails here when building with CONFIG_THUMB2_KERNEL=y
> 
>    AS  arch/arm/crypto/speck-neon-core.o
> 
>  arch/arm/crypto/speck-neon-core.S: Assembler messages:
> 
>  arch/arm/crypto/speck-neon-core.S:419: Error: r13 not allowed here --
>  `bic sp,#0xf'
>  arch/arm/crypto/speck-neon-core.S:423: Error: r13 not allowed here --
>  `bic sp,#0xf'
>  arch/arm/crypto/speck-neon-core.S:427: Error: r13 not allowed here --
>  `bic sp,#0xf'
>  arch/arm/crypto/speck-neon-core.S:431: Error: r13 not allowed here --
>  `bic sp,#0xf'
> 
>  In a quick hack this change seems to address it:
> 
> 
>  -   sub sp, #128
>  -   bic sp, #0xf
>  +   mov r6, sp
>  +   sub r6, #128
>  +   bic r6, #0xf
>  +   mov sp, r6
> 
>  But there is probably a better solution to address this.
> 
> >>>
> >>> Given that there is no NEON on M class cores, I recommend we put 
> >>> something like
> >>>
> >>> THUMB(bx pc)
> >>> THUMB(nop.w)
> >>> THUMB(.arm)
> >>>
> >>> at the beginning and be done with it.
> >>
> >> I mean nop.n or just nop, of course, and we may need a '.align 2' at
> >> the beginning as well.
> >
> > Wouldn't it be preferable to have it assemble it in Thumb2 too? It seems
> > that bic sp,#0xf is the only issue...
> >
> 
> Well, in general, yes. In the case of NEON code, not really, since the
> resulting code will not be smaller anyway, because the Thumb2 NEON
> opcodes are all 4 bytes. Also, Thumb2-only cores don't have NEON
> units, so all cores that this code can run on will be able to run in
> ARM mode.
> 
> So from a maintainability pov, having code that only assembles in one
> way is better than having code that must compile both to ARM and to
> Thumb2 opcodes.
> 
> Just my 2 cents, anyway.

I don't have too much of a preference, though Stefan's suggested 4 instructions
can be reduced to 3, which also matches what aes-neonbs-core.S does:

sub r12, sp, #128
bic r12, #0xf
mov sp, r12

Ard, is the following what you're suggesting instead?

diff --git a/arch/arm/crypto/speck-neon-core.S 
b/arch/arm/crypto/speck-neon-core.S
index 3c1e203e53b9..c989ce3dc057 100644
--- a/arch/arm/crypto/speck-neon-core.S
+++ b/arch/arm/crypto/speck-neon-core.S
@@ -8,6 +8,7 @@
  */
 
 #include 
+#include 
 
.text
.fpuneon
@@ -233,6 +234,12 @@
  * nonzero multiple of 128.
  */
 .macro _speck_xts_cryptn, decrypting
+
+   .align  2
+   THUMB(bx pc)
+   THUMB(nop)
+   THUMB(.arm)
+
push{r4-r7}
mov r7, sp
 
@@ -413,6 +420,8 @@
mov sp, r7
pop {r4-r7}
bx  lr
+
+   THUMB(.thumb)
 .endm
 
 ENTRY(speck128_xts_encrypt_neon)


[PATCH 4/4] crypto: vmac - remove insecure version with hardcoded nonce

2018-06-18 Thread Eric Biggers
From: Eric Biggers 

Remove the original version of the VMAC template that had the nonce
hardcoded to 0 and produced a digest with the wrong endianness.  I'm
unsure whether this had users or not (there are no explicit in-kernel
references to it), but given that the hardcoded nonce made it wildly
insecure unless a unique key was used for each message, let's try
removing it and see if anyone complains.

Leave the new "vmac64" template that requires the nonce to be explicitly
specified as the first 16 bytes of data and uses the correct endianness
for the digest.

Signed-off-by: Eric Biggers 
---
 crypto/tcrypt.c  |   2 +-
 crypto/testmgr.c |   6 ---
 crypto/testmgr.h | 102 ---
 crypto/vmac.c|  84 --
 4 files changed, 8 insertions(+), 186 deletions(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index d5bcdd905007..078ec36007bf 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1939,7 +1939,7 @@ static int do_test(const char *alg, u32 type, u32 mask, 
int m, u32 num_mb)
break;
 
case 109:
-   ret += tcrypt_test("vmac(aes)");
+   ret += tcrypt_test("vmac64(aes)");
break;
 
case 111:
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 60a557b0f8d3..63f263fd1dae 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3477,12 +3477,6 @@ static const struct alg_test_desc alg_test_descs[] = {
.suite = {
.hash = __VECS(tgr192_tv_template)
}
-   }, {
-   .alg = "vmac(aes)",
-   .test = alg_test_hash,
-   .suite = {
-   .hash = __VECS(aes_vmac128_tv_template)
-   }
}, {
.alg = "vmac64(aes)",
.test = alg_test_hash,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 7b022c47a623..b6362169771a 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -4603,108 +4603,6 @@ static const struct hash_testvec 
aes_xcbc128_tv_template[] = {
}
 };
 
-static const char vmac_string1[128] = {'\x01', '\x01', '\x01', '\x01',
-  '\x02', '\x03', '\x02', '\x02',
-  '\x02', '\x04', '\x01', '\x07',
-  '\x04', '\x01', '\x04', '\x03',};
-static const char vmac_string2[128] = {'a', 'b', 'c',};
-static const char vmac_string3[128] = {'a', 'b', 'c', 'a', 'b', 'c',
-  'a', 'b', 'c', 'a', 'b', 'c',
-  'a', 'b', 'c', 'a', 'b', 'c',
-  'a', 'b', 'c', 'a', 'b', 'c',
-  'a', 'b', 'c', 'a', 'b', 'c',
-  'a', 'b', 'c', 'a', 'b', 'c',
-  'a', 'b', 'c', 'a', 'b', 'c',
-  'a', 'b', 'c', 'a', 'b', 'c',
- };
-
-static const char vmac_string4[17] = {'b', 'c', 'e', 'f',
- 'i', 'j', 'l', 'm',
- 'o', 'p', 'r', 's',
- 't', 'u', 'w', 'x', 'z'};
-
-static const char vmac_string5[127] = {'r', 'm', 'b', 't', 'c',
-  'o', 'l', 'k', ']', '%',
-  '9', '2', '7', '!', 'A'};
-
-static const char vmac_string6[129] = {'p', 't', '*', '7', 'l',
-  'i', '!', '#', 'w', '0',
-  'z', '/', '4', 'A', 'n'};
-
-static const struct hash_testvec aes_vmac128_tv_template[] = {
-   {
-   .key= "\x00\x01\x02\x03\x04\x05\x06\x07"
- "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
-   .plaintext = NULL,
-   .digest = "\x07\x58\x80\x35\x77\xa4\x7b\x54",
-   .psize  = 0,
-   .ksize  = 16,
-   }, {
-   .key= "\x00\x01\x02\x03\x04\x05\x06\x07"
- "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
-   .plaintext = vmac_string1,
-   .digest = "\xce\xf5\x3c\xd3\xae\x68\x8c\xa1",
-   .psize  = 128,
-   .ksize  = 16,
-   }, {
-   .key= "\x00\x01\x02\x03\x04\x05\x06\x07"
- "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
-   .plaintext = vmac_string2,
-   .digest = "\xc9\x27\xb0\x73\x81\xbd\x14\x2d",
-   .psize  = 128,
-   .ksize  = 16,
-   }, {
-   .key= "\x00\x01\x02\x03\x04\x05\x06\x07"
- "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
-   .plaintext = vmac_string3,
-   .digest = "\x8d\x1a\x95\x8c\x98\x47\x0b\x19",
-   .psize  = 128,
-   .ksize  = 16,
-   }, {
-   .key= "abcdefghijklmnop",
-  

[PATCH 0/4] crypto: vmac - various fixes

2018-06-18 Thread Eric Biggers
From: Eric Biggers 

Hi, this series fixes various bugs in the VMAC template (crypto/vmac.c).
First, the per-request context was being stored in the transform
context, which made VMAC not thread-safe, and the kernel could be
crashed by using the same VMAC transform in multiple threads using
AF_ALG (found by syzkaller).  Also the keys were incorrectly being wiped
after each message.  Patch 2 fixes these bugs, Cc'ed to stable.

But there are also bugs that require breaking changes: the nonce is
hardcoded to 0, and the endianness of the final digest is wrong.  So
patch 3 introduces a fixed version of the VMAC template that takes the
nonce as the first 16 bytes of data, and fixes the digest endianness.

Patch 4 then removes the current version of the VMAC template.  I'm not
100% sure whether we can really do that or not as it may have users
(there are no explicit users in the kernel, though), but given that the
old version was insecure unless a unique key was set for each message, I
think we should try and see if anyone complains.

Eric Biggers (4):
  crypto: vmac - require a block cipher with 128-bit block size
  crypto: vmac - separate tfm and request context
  crypto: vmac - add nonced version with big endian digest
  crypto: vmac - remove insecure version with hardcoded nonce

 crypto/tcrypt.c   |   2 +-
 crypto/testmgr.c  |   4 +-
 crypto/testmgr.h  | 217 +
 crypto/vmac.c | 444 --
 include/crypto/vmac.h |  63 --
 5 files changed, 351 insertions(+), 379 deletions(-)
 delete mode 100644 include/crypto/vmac.h

-- 
2.18.0.rc1.244.gcf134e6275-goog



[PATCH 1/4] crypto: vmac - require a block cipher with 128-bit block size

2018-06-18 Thread Eric Biggers
From: Eric Biggers 

The VMAC template assumes the block cipher has a 128-bit block size, but
it failed to check for that.  Thus it was possible to instantiate it
using a 64-bit block size cipher, e.g. "vmac(cast5)", causing
uninitialized memory to be used.

Add the needed check when instantiating the template.

Fixes: f1939f7c5645 ("crypto: vmac - New hash algorithm for intel_txt support")
Cc:  # v2.6.32+
Signed-off-by: Eric Biggers 
---
 crypto/vmac.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/crypto/vmac.c b/crypto/vmac.c
index df76a816cfb2..3034454a3713 100644
--- a/crypto/vmac.c
+++ b/crypto/vmac.c
@@ -655,6 +655,10 @@ static int vmac_create(struct crypto_template *tmpl, 
struct rtattr **tb)
if (IS_ERR(alg))
return PTR_ERR(alg);
 
+   err = -EINVAL;
+   if (alg->cra_blocksize != 16)
+   goto out_put_alg;
+
inst = shash_alloc_instance("vmac", alg);
err = PTR_ERR(inst);
if (IS_ERR(inst))
-- 
2.18.0.rc1.244.gcf134e6275-goog



[PATCH 3/4] crypto: vmac - add nonced version with big endian digest

2018-06-18 Thread Eric Biggers
From: Eric Biggers 

Currently the VMAC template uses a "nonce" hardcoded to 0, which makes
it insecure unless a unique key is set for every message.  Also, the
endianness of the final digest is wrong: the implementation uses little
endian, but the VMAC specification has it as big endian, as do other
VMAC implementations such as the one in Crypto++.

Add a new VMAC template where the nonce is passed as the first 16 bytes
of data (similar to what is done for Poly1305's nonce), and the digest
is big endian.  Call it "vmac64", since the old name of simply "vmac"
didn't clarify whether the implementation is of VMAC-64 or of VMAC-128
(which produce 64-bit and 128-bit digests respectively); so we fix the
naming ambiguity too.

Signed-off-by: Eric Biggers 
---
 crypto/testmgr.c |   6 ++
 crypto/testmgr.h | 155 +++
 crypto/vmac.c| 130 +--
 3 files changed, 273 insertions(+), 18 deletions(-)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 11e45352fd0b..60a557b0f8d3 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3483,6 +3483,12 @@ static const struct alg_test_desc alg_test_descs[] = {
.suite = {
.hash = __VECS(aes_vmac128_tv_template)
}
+   }, {
+   .alg = "vmac64(aes)",
+   .test = alg_test_hash,
+   .suite = {
+   .hash = __VECS(vmac64_aes_tv_template)
+   }
}, {
.alg = "wp256",
.test = alg_test_hash,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index b950aa234e43..7b022c47a623 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -4705,6 +4705,161 @@ static const struct hash_testvec 
aes_vmac128_tv_template[] = {
},
 };
 
+static const char vmac64_string1[144] = {
+   '\0', '\0',   '\0',   '\0',   '\0',   '\0',   '\0',   '\0',
+   '\0', '\0',   '\0',   '\0',   '\0',   '\0',   '\0',   '\0',
+   '\x01', '\x01', '\x01', '\x01', '\x02', '\x03', '\x02', '\x02',
+   '\x02', '\x04', '\x01', '\x07', '\x04', '\x01', '\x04', '\x03',
+};
+
+static const char vmac64_string2[144] = {
+   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+'a',  'b',  'c',
+};
+
+static const char vmac64_string3[144] = {
+   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+'a',  'b',  'c',  'a',  'b',  'c',  'a',  'b',
+'c',  'a',  'b',  'c',  'a',  'b',  'c',  'a',
+'b',  'c',  'a',  'b',  'c',  'a',  'b',  'c',
+'a',  'b',  'c',  'a',  'b',  'c',  'a',  'b',
+'c',  'a',  'b',  'c',  'a',  'b',  'c',  'a',
+'b',  'c',  'a',  'b',  'c',  'a',  'b',  'c',
+};
+
+static const char vmac64_string4[33] = {
+   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+   'b',   'c',  'e',  'f',  'i',  'j',  'l',  'm',
+   'o',   'p',  'r',  's',  't',  'u',  'w',  'x',
+   'z',
+};
+
+static const char vmac64_string5[143] = {
+   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+'r',  'm',  'b',  't',  'c',  'o',  'l',  'k',
+']',  '%',  '9',  '2',  '7',  '!',  'A',
+};
+
+static const char vmac64_string6[145] = {
+   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+'p',  't',  '*',  '7',  'l',  'i',  '!',  '#',
+'w',  '0',  'z',  '/',  '4',  'A',  'n',
+};
+
+static const struct hash_testvec vmac64_aes_tv_template[] = {
+   { /* draft-krovetz-vmac-01 test vector 1 */
+   .key= "abcdefghijklmnop",
+   .ksize  = 16,
+   .plaintext = "\0\0\0\0\0\0\0\0bcdefghi",
+   .psize  = 16,
+   .digest = "\x25\x76\xbe\x1c\x56\xd8\xb8\x1b",
+   }, { /* draft-krovetz-vmac-01 test vector 2 */
+   .key= "abcdefghijklmnop",
+   .ksize  = 16,
+   .plaintext = "\0\0\0\0\0\0\0\0bcdefghiabc",
+   .psize  = 19,
+   .digest = "\x2d\x37\x6c\xf5\xb1\x81\x3c\xe5",
+   }, { /* draft-krovetz-vmac-01 test vector 3 */
+   .key= "abcdefghijklmnop",
+   .ksize  = 16,
+   .plaintext = "\0\0\0\0\0\0\0\0bcdefghi"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc",
+   .psize  = 64,
+   .digest = "\xe8\x42\x1f\x61\xd5\x73\xd2\x98",
+   }, { /* draft-krovetz-vmac-01 test vector 4 */
+   .key= "abcdefghijklmnop",
+   .ksize  = 16,
+   .plaintext = "\0\0\0\0\0\0\0\0bcdefghi"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+  

[PATCH 2/4] crypto: vmac - separate tfm and request context

2018-06-18 Thread Eric Biggers
From: Eric Biggers 

syzbot reported a crash in vmac_final() when multiple threads
concurrently use the same "vmac(aes)" transform through AF_ALG.  The bug
is pretty fundamental: the VMAC template doesn't separate per-request
state from per-tfm (per-key) state like the other hash algorithms do,
but rather stores it all in the tfm context.  That's wrong.

Also, vmac_final() incorrectly zeroes most of the state including the
derived keys and cached pseudorandom pad.  Therefore, only the first
VMAC invocation with a given key calculates the correct digest.

Fix these bugs by splitting the per-tfm state from the per-request state
and using the proper init/update/final sequencing for requests.

Reproducer for the crash:

#include 
#include 
#include 

int main()
{
int fd;
struct sockaddr_alg addr = {
.salg_type = "hash",
.salg_name = "vmac(aes)",
};
char buf[256] = { 0 };

fd = socket(AF_ALG, SOCK_SEQPACKET, 0);
bind(fd, (void *), sizeof(addr));
setsockopt(fd, SOL_ALG, ALG_SET_KEY, buf, 16);
fork();
fd = accept(fd, NULL, NULL);
for (;;)
write(fd, buf, 256);
}

The immediate cause of the crash is that vmac_ctx_t.partial_size exceeds
VMAC_NHBYTES, causing vmac_final() to memset() a negative length.

Reported-by: syzbot+264bca3a6e8d64555...@syzkaller.appspotmail.com
Fixes: f1939f7c5645 ("crypto: vmac - New hash algorithm for intel_txt support")
Cc:  # v2.6.32+
Signed-off-by: Eric Biggers 
---
 crypto/vmac.c | 408 +++---
 include/crypto/vmac.h |  63 ---
 2 files changed, 181 insertions(+), 290 deletions(-)
 delete mode 100644 include/crypto/vmac.h

diff --git a/crypto/vmac.c b/crypto/vmac.c
index 3034454a3713..bb2fc787d615 100644
--- a/crypto/vmac.c
+++ b/crypto/vmac.c
@@ -1,6 +1,10 @@
 /*
- * Modified to interface to the Linux kernel
+ * VMAC: Message Authentication Code using Universal Hashing
+ *
+ * Reference: https://tools.ietf.org/html/draft-krovetz-vmac-01
+ *
  * Copyright (c) 2009, Intel Corporation.
+ * Copyright (c) 2018, Google Inc.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -16,14 +20,15 @@
  * Place - Suite 330, Boston, MA 02111-1307 USA.
  */
 
-/* --
- * VMAC and VHASH Implementation by Ted Krovetz (t...@acm.org) and Wei Dai.
- * This implementation is herby placed in the public domain.
- * The authors offers no warranty. Use at your own risk.
- * Please send bug reports to the authors.
- * Last modified: 17 APR 08, 1700 PDT
- * --- */
+/*
+ * Derived from:
+ * VMAC and VHASH Implementation by Ted Krovetz (t...@acm.org) and Wei Dai.
+ * This implementation is herby placed in the public domain.
+ * The authors offers no warranty. Use at your own risk.
+ * Last modified: 17 APR 08, 1700 PDT
+ */
 
+#include 
 #include 
 #include 
 #include 
@@ -31,9 +36,35 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
+/*
+ * User definable settings.
+ */
+#define VMAC_TAG_LEN   64
+#define VMAC_KEY_SIZE  128/* Must be 128, 192 or 256   */
+#define VMAC_KEY_LEN   (VMAC_KEY_SIZE/8)
+#define VMAC_NHBYTES   128/* Must 2^i for any 3 < i < 13 Standard = 128*/
+
+/* per-transform (per-key) context */
+struct vmac_tfm_ctx {
+   struct crypto_cipher *cipher;
+   u64 nhkey[(VMAC_NHBYTES/8)+2*(VMAC_TAG_LEN/64-1)];
+   u64 polykey[2*VMAC_TAG_LEN/64];
+   u64 l3key[2*VMAC_TAG_LEN/64];
+};
+
+/* per-request context */
+struct vmac_desc_ctx {
+   union {
+   u8 partial[VMAC_NHBYTES];   /* partial block */
+   __le64 partial_words[VMAC_NHBYTES / 8];
+   };
+   unsigned int partial_size;  /* size of the partial block */
+   bool first_block_processed;
+   u64 polytmp[2*VMAC_TAG_LEN/64]; /* running total of L2-hash */
+};
+
 /*
  * Constants and masks
  */
@@ -318,13 +349,6 @@ static void poly_step_func(u64 *ahi, u64 *alo,
} while (0)
 #endif
 
-static void vhash_abort(struct vmac_ctx *ctx)
-{
-   ctx->polytmp[0] = ctx->polykey[0] ;
-   ctx->polytmp[1] = ctx->polykey[1] ;
-   ctx->first_block_processed = 0;
-}
-
 static u64 l3hash(u64 p1, u64 p2, u64 k1, u64 k2, u64 len)
 {
u64 rh, rl, t, z = 0;
@@ -364,280 +388,209 @@ static u64 l3hash(u64 p1, u64 p2, u64 k1, u64 k2, u64 
len)
return rl;
 }
 
-static void vhash_update(const unsigned char *m,
-   unsigned int mbytes, /* Pos multiple of VMAC_NHBYTES */
-   struct vmac_ctx *ctx)
+/* L1 and L2-hash one or more VMAC_NHBYTES-byte blocks */
+static void vhash_blocks(const struct vmac_tfm_ctx *tctx,
+