[PATCH] crypto: chacha20 - Fix chacha20_block() keystream alignment (again)

2018-09-11 Thread Eric Biggers
From: Eric Biggers 

In commit 9f480faec58c ("crypto: chacha20 - Fix keystream alignment for
chacha20_block()"), I had missed that chacha20_block() can be called
directly on the buffer passed to get_random_bytes(), which can have any
alignment.  So, while my commit didn't break anything, it didn't fully
solve the alignment problems.

Revert my solution and just update chacha20_block() to use
put_unaligned_le32(), so the output buffer need not be aligned.
This is simpler, and on many CPUs it's the same speed.

But, I kept the 'tmp' buffers in extract_crng_user() and
_get_random_bytes() 4-byte aligned, since that alignment is actually
needed for _crng_backtrack_protect() too.

Reported-by: Stephan Müller 
Cc: Theodore Ts'o 
Signed-off-by: Eric Biggers 
---
 crypto/chacha20_generic.c |  7 ++++---
 drivers/char/random.c     | 24 ++++++++++++------------
 include/crypto/chacha20.h |  3 +--
 lib/chacha20.c            |  6 +++---
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
index e451c3cb6a56..3ae96587caf9 100644
--- a/crypto/chacha20_generic.c
+++ b/crypto/chacha20_generic.c
@@ -18,20 +18,21 @@
 static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
 unsigned int bytes)
 {
-   u32 stream[CHACHA20_BLOCK_WORDS];
+   /* aligned to potentially speed up crypto_xor() */
+   u8 stream[CHACHA20_BLOCK_SIZE] __aligned(sizeof(long));
 
if (dst != src)
memcpy(dst, src, bytes);
 
while (bytes >= CHACHA20_BLOCK_SIZE) {
chacha20_block(state, stream);
-   crypto_xor(dst, (const u8 *)stream, CHACHA20_BLOCK_SIZE);
+   crypto_xor(dst, stream, CHACHA20_BLOCK_SIZE);
bytes -= CHACHA20_BLOCK_SIZE;
dst += CHACHA20_BLOCK_SIZE;
}
if (bytes) {
chacha20_block(state, stream);
-   crypto_xor(dst, (const u8 *)stream, bytes);
+   crypto_xor(dst, stream, bytes);
}
 }
 
diff --git a/drivers/char/random.c b/drivers/char/random.c
index bf5f99fc36f1..d22d967c50f0 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -433,9 +433,9 @@ static int crng_init_cnt = 0;
 static unsigned long crng_global_init_time = 0;
 #define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE)
 static void _extract_crng(struct crng_state *crng,
- __u32 out[CHACHA20_BLOCK_WORDS]);
+ __u8 out[CHACHA20_BLOCK_SIZE]);
 static void _crng_backtrack_protect(struct crng_state *crng,
-   __u32 tmp[CHACHA20_BLOCK_WORDS], int used);
+   __u8 tmp[CHACHA20_BLOCK_SIZE], int used);
 static void process_random_ready_list(void);
 static void _get_random_bytes(void *buf, int nbytes);
 
@@ -921,7 +921,7 @@ static void crng_reseed(struct crng_state *crng, struct 
entropy_store *r)
unsigned long   flags;
int i, num;
union {
-   __u32   block[CHACHA20_BLOCK_WORDS];
+   __u8block[CHACHA20_BLOCK_SIZE];
__u32   key[8];
} buf;
 
@@ -968,7 +968,7 @@ static void crng_reseed(struct crng_state *crng, struct 
entropy_store *r)
 }
 
 static void _extract_crng(struct crng_state *crng,
- __u32 out[CHACHA20_BLOCK_WORDS])
+ __u8 out[CHACHA20_BLOCK_SIZE])
 {
unsigned long v, flags;
 
@@ -985,7 +985,7 @@ static void _extract_crng(struct crng_state *crng,
	spin_unlock_irqrestore(&crng->lock, flags);
 }
 
-static void extract_crng(__u32 out[CHACHA20_BLOCK_WORDS])
+static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
 {
struct crng_state *crng = NULL;
 
@@ -1003,7 +1003,7 @@ static void extract_crng(__u32 out[CHACHA20_BLOCK_WORDS])
  * enough) to mutate the CRNG key to provide backtracking protection.
  */
 static void _crng_backtrack_protect(struct crng_state *crng,
-   __u32 tmp[CHACHA20_BLOCK_WORDS], int used)
+   __u8 tmp[CHACHA20_BLOCK_SIZE], int used)
 {
unsigned long   flags;
__u32   *s, *d;
@@ -1015,14 +1015,14 @@ static void _crng_backtrack_protect(struct crng_state 
*crng,
used = 0;
}
	spin_lock_irqsave(&crng->lock, flags);
-	s = &tmp[used / sizeof(__u32)];
+	s = (__u32 *) &tmp[used];
	d = &crng->state[4];
	for (i=0; i < 8; i++)
		*d++ ^= *s++;
	spin_unlock_irqrestore(&crng->lock, flags);
 }
 
-static void crng_backtrack_protect(__u32 tmp[CHACHA20_BLOCK_WORDS], int used)
+static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used)
 {
struct crng_state *crng = NULL;
 
@@ -1038,7 +1038,7 @@ static void crng_backtrack_protect(__u32 
tmp[CHACHA20_BLOCK_WORDS], int used)
 static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
 {
ssize_t ret = 0, i = CHACHA20_BLOCK_SIZE;
-

Re: random: ensure use of aligned buffers with ChaCha20

2018-09-11 Thread Eric Biggers
To revive this...

On Fri, Aug 10, 2018 at 08:27:58AM +0200, Stephan Mueller wrote:
> Am Donnerstag, 9. August 2018, 21:40:12 CEST schrieb Eric Biggers:
> 
> Hi Eric,
> 
> > while (bytes >= CHACHA20_BLOCK_SIZE) {
> > chacha20_block(state, stream);
> > -   crypto_xor(dst, (const u8 *)stream, CHACHA20_BLOCK_SIZE);
> > +   crypto_xor(dst, stream, CHACHA20_BLOCK_SIZE);
> 
> If we are at it, I am wondering whether we should use crypto_xor. At this 
> point we exactly know that the data is CHACHA20_BLOCK_SIZE bytes in length 
> which is divisible by u32. Hence, shouldn't we disregard crypto_xor in favor 
> of a loop iterating in 32 bits words? crypto_xor contains some checks for 
> trailing bytes which we could spare.

crypto_xor() here is fine.  It already meets the conditions for the inlined
version that XOR's a long at a time:

if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
__builtin_constant_p(size) &&
(size % sizeof(unsigned long)) == 0) {
unsigned long *d = (unsigned long *)dst;
unsigned long *s = (unsigned long *)src;

while (size > 0) {
*d++ ^= *s++;
size -= sizeof(unsigned long);
}
}

And regardless, it's better to optimize crypto_xor() once, than all the callers.

> 
> > bytes -= CHACHA20_BLOCK_SIZE;
> > dst += CHACHA20_BLOCK_SIZE;
> > }
> > if (bytes) {
> > chacha20_block(state, stream);
> > -   crypto_xor(dst, (const u8 *)stream, bytes);
> > +   crypto_xor(dst, stream, bytes);
> 
> Same here.

'bytes' need not be a multiple of sizeof(u32) or sizeof(long), and 'dst' can
have any alignment...  So we should just call crypto_xor() which does the right
thing, and is intended to do so as efficiently as possible.

> 
> > @@ -1006,14 +1006,14 @@ static void _crng_backtrack_protect(struct
> > crng_state *crng, used = 0;
> > }
> > spin_lock_irqsave(&crng->lock, flags);
> > -   s = &tmp[used / sizeof(__u32)];
> > +   s = (__u32 *) &tmp[used];
> 
> As Yann said, wouldn't you have the alignment problem here again?
> 
> Somehow, somebody must check the provided input buffer at one time.
> 

I guess we should just explicitly align the 'tmp' buffers in _get_random_bytes()
and extract_crng_user().

- Eric