Re: sha512: make it work, undo percpu message schedule

2012-01-13 Thread Eric Dumazet
Le vendredi 13 janvier 2012 à 18:08 +1100, Herbert Xu a écrit :
 On Fri, Jan 13, 2012 at 02:55:14AM +0300, Alexey Dobriyan wrote:
 
  Herbert, I couldn't come up with a single scenario. :-(
  But the bug is easy to reproduce.
 
 OK, does this patch work for you?
 
 commit 31f4e55c09c1170f8b813c14b1299b70f50db414
 Author: Herbert Xu herb...@gondor.apana.org.au
 Date:   Fri Jan 13 18:06:50 2012 +1100
 
 crypto: sha512 - Fix msg_schedule race
 
 The percpu msg_schedule setup was unsafe as a user in a process
 context can be interrupted by a softirq user which would then
 scribble over the exact same work area.  This was discovered by
 Steffen Klassert.
 
 This patch based on ideas from Eric Dumazet fixes this by using
 two independent work areas.
 
 Reported-by: Alexey Dobriyan adobri...@gmail.com
 Signed-off-by: Herbert Xu herb...@gondor.apana.org.au
 

I wonder ...

With 4096 cpus, do we really want to reserve 5242880 bytes of memory for
this function ?

What about following patch instead ?

(Trying a dynamic memory allocation, and fallback on a single
pre-allocated bloc of memory, shared by all cpus, protected by a
spinlock)

diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c
index 9ed9f60..5c80a76 100644
--- a/crypto/sha512_generic.c
+++ b/crypto/sha512_generic.c
@@ -21,7 +21,6 @@
 #include linux/percpu.h
 #include asm/byteorder.h
 
-static DEFINE_PER_CPU(u64[80], msg_schedule);
 
 static inline u64 Ch(u64 x, u64 y, u64 z)
 {
@@ -87,10 +86,16 @@ static void
 sha512_transform(u64 *state, const u8 *input)
 {
u64 a, b, c, d, e, f, g, h, t1, t2;
-
+   static u64 msg_schedule[80];
+   static DEFINE_SPINLOCK(msg_schedule_lock);
int i;
-   u64 *W = get_cpu_var(msg_schedule);
+   u64 *W = kzalloc(sizeof(msg_schedule), GFP_ATOMIC | __GFP_NOWARN);
 
+   
+   if (!W) {
+   spin_lock_bh(msg_schedule_lock);
+   W = msg_schedule;
+   }
/* load the input */
 for (i = 0; i  16; i++)
 LOAD_OP(i, W, input);
@@ -128,8 +133,11 @@ sha512_transform(u64 *state, const u8 *input)
 
/* erase our data */
a = b = c = d = e = f = g = h = t1 = t2 = 0;
-   memset(W, 0, sizeof(__get_cpu_var(msg_schedule)));
-   put_cpu_var(msg_schedule);
+   memset(W, 0, sizeof(msg_schedule));
+   if (W == msg_schedule)
+   spin_unlock_bh(msg_schedule_lock);
+   else
+   kfree(W);
 }
 
 static int


--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: sha512: make it work, undo percpu message schedule

2012-01-13 Thread Eric Dumazet
Le vendredi 13 janvier 2012 à 11:35 +0100, Eric Dumazet a écrit :

 I wonder ...
 
 With 4096 cpus, do we really want to reserve 5242880 bytes of memory for
 this function ?
 
 What about following patch instead ?
 
 (Trying a dynamic memory allocation, and fallback on a single
 pre-allocated bloc of memory, shared by all cpus, protected by a
 spinlock)
 
 diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c
 index 9ed9f60..5c80a76 100644
 --- a/crypto/sha512_generic.c
 +++ b/crypto/sha512_generic.c
 @@ -21,7 +21,6 @@
  #include linux/percpu.h
  #include asm/byteorder.h
  
 -static DEFINE_PER_CPU(u64[80], msg_schedule);
  
  static inline u64 Ch(u64 x, u64 y, u64 z)
  {
 @@ -87,10 +86,16 @@ static void
  sha512_transform(u64 *state, const u8 *input)
  {
   u64 a, b, c, d, e, f, g, h, t1, t2;
 -
 + static u64 msg_schedule[80];
 + static DEFINE_SPINLOCK(msg_schedule_lock);
   int i;
 - u64 *W = get_cpu_var(msg_schedule);
 + u64 *W = kzalloc(sizeof(msg_schedule), GFP_ATOMIC | __GFP_NOWARN);
  

And a plain kmalloc() is enough, since we fully initialize the array a
bit later.

for (i = 0; i  16; i++)
LOAD_OP(i, W, input);
for (i = 16; i  80; i++) {
BLEND_OP(i, W);
}




--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: sha512: make it work, undo percpu message schedule

2012-01-13 Thread Eric Dumazet
Le vendredi 13 janvier 2012 à 11:41 +0100, Eric Dumazet a écrit :

 And a plain kmalloc() is enough, since we fully initialize the array a
 bit later.
 
   for (i = 0; i  16; i++)
   LOAD_OP(i, W, input);
   for (i = 16; i  80; i++) {
   BLEND_OP(i, W);
   }


Here is an official patch ;)

[PATCH v3] crypto: sha512 - Fix msg_schedule race

The percpu msg_schedule setup was unsafe as a user in a process
context can be interrupted by a softirq user which would then
scribble over the exact same work area.

Bug reported by Alexey Dobriyan, and diagnosed by Steffen Klassert.

Bug added in commit f9e2bca6c22 (crypto: sha512 - Move message schedule
W[80] to static percpu area)

Try a dynamic memory allocation, and fallback using a single static
area, guarded by a spinlock.

This removes use of percpu memory.

Reported-by: Alexey Dobriyan adobri...@gmail.com
Signed-off-by: Eric Dumazet eric.duma...@gmail.com
CC: Herbert Xu herb...@gondor.apana.org.au
CC: Adrian-Ken Rueegsegger k...@codelabs.ch
CC: Steffen Klassert steffen.klass...@secunet.com
---
 crypto/sha512_generic.c |   18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c
index 9ed9f60..b52ef9b 100644
--- a/crypto/sha512_generic.c
+++ b/crypto/sha512_generic.c
@@ -18,10 +18,8 @@
 #include linux/crypto.h
 #include linux/types.h
 #include crypto/sha.h
-#include linux/percpu.h
 #include asm/byteorder.h
 
-static DEFINE_PER_CPU(u64[80], msg_schedule);
 
 static inline u64 Ch(u64 x, u64 y, u64 z)
 {
@@ -87,10 +85,15 @@ static void
 sha512_transform(u64 *state, const u8 *input)
 {
u64 a, b, c, d, e, f, g, h, t1, t2;
-
+   static u64 msg_schedule[80];
+   static DEFINE_SPINLOCK(msg_schedule_lock);
int i;
-   u64 *W = get_cpu_var(msg_schedule);
+   u64 *W = kmalloc(sizeof(msg_schedule), GFP_ATOMIC | __GFP_NOWARN);
 
+   if (!W) {
+   spin_lock_bh(msg_schedule_lock);
+   W = msg_schedule;
+   }
/* load the input */
 for (i = 0; i  16; i++)
 LOAD_OP(i, W, input);
@@ -128,8 +131,11 @@ sha512_transform(u64 *state, const u8 *input)
 
/* erase our data */
a = b = c = d = e = f = g = h = t1 = t2 = 0;
-   memset(W, 0, sizeof(__get_cpu_var(msg_schedule)));
-   put_cpu_var(msg_schedule);
+   memset(W, 0, sizeof(msg_schedule));
+   if (W == msg_schedule)
+   spin_unlock_bh(msg_schedule_lock);
+   else
+   kfree(W);
 }
 
 static int


--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: sha512: make it work, undo percpu message schedule

2012-01-13 Thread Steffen Klassert
On Fri, Jan 13, 2012 at 11:35:42AM +0100, Eric Dumazet wrote:
 Le vendredi 13 janvier 2012 à 18:08 +1100, Herbert Xu a écrit :
  On Fri, Jan 13, 2012 at 02:55:14AM +0300, Alexey Dobriyan wrote:
  
   Herbert, I couldn't come up with a single scenario. :-(
   But the bug is easy to reproduce.
  
  OK, does this patch work for you?
  
  commit 31f4e55c09c1170f8b813c14b1299b70f50db414
  Author: Herbert Xu herb...@gondor.apana.org.au
  Date:   Fri Jan 13 18:06:50 2012 +1100
  
  crypto: sha512 - Fix msg_schedule race
  
  The percpu msg_schedule setup was unsafe as a user in a process
  context can be interrupted by a softirq user which would then
  scribble over the exact same work area.  This was discovered by
  Steffen Klassert.
  
  This patch based on ideas from Eric Dumazet fixes this by using
  two independent work areas.
  
  Reported-by: Alexey Dobriyan adobri...@gmail.com
  Signed-off-by: Herbert Xu herb...@gondor.apana.org.au
  
 
 I wonder ...
 
 With 4096 cpus, do we really want to reserve 5242880 bytes of memory for
 this function ?
 
 What about following patch instead ?
 
 (Trying a dynamic memory allocation, and fallback on a single
 pre-allocated bloc of memory, shared by all cpus, protected by a
 spinlock)

If we want to do dynamic memory allocation, we could place it to the
shash_desc context where we already store the intermediate digest value.
This is preallocated anyway, so we don't need to do another allocation.

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: sha512: make it work, undo percpu message schedule

2012-01-13 Thread Alexey Dobriyan
On 1/13/12, Eric Dumazet eric.duma...@gmail.com wrote:

 + static u64 msg_schedule[80];
 + static DEFINE_SPINLOCK(msg_schedule_lock);

No guys, no.

SHA-512 only needs u64[16] running window for message scheduling.

I'm sending whitespace mangled patch which is only tested
with selfcryptotest passed, so you won't apply something complex.

Stackspace usage drops down to like this:

-139:   48 81 ec c8 02 00 00sub$0x2c8,%rsp
+136:   48 81 ec 18 01 00 00sub$0x118,%rsp

--- a/crypto/sha512_generic.c
+++ b/crypto/sha512_generic.c
@@ -21,8 +21,6 @@
 #include linux/percpu.h
 #include asm/byteorder.h

-static DEFINE_PER_CPU(u64[80], msg_schedule);
-
 static inline u64 Ch(u64 x, u64 y, u64 z)
 {
 return z ^ (x  (y ^ z));
@@ -80,7 +78,7 @@ static inline void LOAD_OP(int I, u64 *W, const u8 *input)

 static inline void BLEND_OP(int I, u64 *W)
 {
-   W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
+   W[I%16] = s1(W[(I-2)%16]) + W[(I-7)%16] + s0(W[(I-15)%16]) + W[I%16];
 }

 static void
@@ -89,38 +87,48 @@ sha512_transform(u64 *state, const u8 *input)
u64 a, b, c, d, e, f, g, h, t1, t2;

int i;
-   u64 *W = get_cpu_var(msg_schedule);
+   u64 W[16];

/* load the input */
 for (i = 0; i  16; i++)
 LOAD_OP(i, W, input);

-for (i = 16; i  80; i++) {
-BLEND_OP(i, W);
-}
-
/* load the state into our registers */
a=state[0];   b=state[1];   c=state[2];   d=state[3];
e=state[4];   f=state[5];   g=state[6];   h=state[7];

-   /* now iterate */
-   for (i=0; i80; i+=8) {
-   t1 = h + e1(e) + Ch(e,f,g) + sha512_K[i  ] + W[i  ];
-   t2 = e0(a) + Maj(a,b,c);d+=t1;h=t1+t2;
-   t1 = g + e1(d) + Ch(d,e,f) + sha512_K[i+1] + W[i+1];
-   t2 = e0(h) + Maj(h,a,b);c+=t1;g=t1+t2;
-   t1 = f + e1(c) + Ch(c,d,e) + sha512_K[i+2] + W[i+2];
-   t2 = e0(g) + Maj(g,h,a);b+=t1;f=t1+t2;
-   t1 = e + e1(b) + Ch(b,c,d) + sha512_K[i+3] + W[i+3];
-   t2 = e0(f) + Maj(f,g,h);a+=t1;e=t1+t2;
-   t1 = d + e1(a) + Ch(a,b,c) + sha512_K[i+4] + W[i+4];
-   t2 = e0(e) + Maj(e,f,g);h+=t1;d=t1+t2;
-   t1 = c + e1(h) + Ch(h,a,b) + sha512_K[i+5] + W[i+5];
-   t2 = e0(d) + Maj(d,e,f);g+=t1;c=t1+t2;
-   t1 = b + e1(g) + Ch(g,h,a) + sha512_K[i+6] + W[i+6];
-   t2 = e0(c) + Maj(c,d,e);f+=t1;b=t1+t2;
-   t1 = a + e1(f) + Ch(f,g,h) + sha512_K[i+7] + W[i+7];
-   t2 = e0(b) + Maj(b,c,d);e+=t1;a=t1+t2;
+#define SHA512_0_15(i, a, b, c, d, e, f, g, h) \
+   t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i] + W[i];  \
+   t2 = e0(a) + Maj(a,b,c);\
+   d += t1;\
+   h = t1 + t2
+
+#define SHA512_16_79(i, a, b, c, d, e, f, g, h)\
+   BLEND_OP(i, W); \
+   t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i] + W[(i)%16]; \
+   t2 = e0(a) + Maj(a,b,c);\
+   d += t1;\
+   h = t1 + t2
+
+   for (i = 0; i  16; i += 8) {
+   SHA512_0_15(i, a, b, c, d, e, f, g, h);
+   SHA512_0_15(i + 1, h, a, b, c, d, e, f, g);
+   SHA512_0_15(i + 2, g, h, a, b, c, d, e, f);
+   SHA512_0_15(i + 3, f, g, h, a, b, c, d, e);
+   SHA512_0_15(i + 4, e, f, g, h, a, b, c, d);
+   SHA512_0_15(i + 5, d, e, f, g, h, a, b, c);
+   SHA512_0_15(i + 6, c, d, e, f, g, h, a, b);
+   SHA512_0_15(i + 7, b, c, d, e, f, g, h, a);
+   }
+   for (i = 16; i  80; i += 8) {
+   SHA512_16_79(i, a, b, c, d, e, f, g, h);
+   SHA512_16_79(i + 1, h, a, b, c, d, e, f, g);
+   SHA512_16_79(i + 2, g, h, a, b, c, d, e, f);
+   SHA512_16_79(i + 3, f, g, h, a, b, c, d, e);
+   SHA512_16_79(i + 4, e, f, g, h, a, b, c, d);
+   SHA512_16_79(i + 5, d, e, f, g, h, a, b, c);
+   SHA512_16_79(i + 6, c, d, e, f, g, h, a, b);
+   SHA512_16_79(i + 7, b, c, d, e, f, g, h, a);
}

state[0] += a; state[1] += b; state[2] += c; state[3] += d;
@@ -128,8 +136,6 @@ sha512_transform(u64 *state, const u8 *input)

/* erase our data */
a = b = c = d = e = f = g = h = t1 = t2 = 0;
-   memset(W, 0, sizeof(__get_cpu_var(msg_schedule)));
-   put_cpu_var(msg_schedule);
 }

 static int
--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: sha512: make it work, undo percpu message schedule

2012-01-13 Thread David Laight
 
 Trying a dynamic memory allocation, and fallback on a single
 pre-allocated bloc of memory, shared by all cpus, protected by a
 spinlock
...
 -
 + static u64 msg_schedule[80];
 + static DEFINE_SPINLOCK(msg_schedule_lock);
   int i;
 - u64 *W = get_cpu_var(msg_schedule);
 + u64 *W = kzalloc(sizeof(msg_schedule), GFP_ATOMIC |
__GFP_NOWARN);
  
 + 
 + if (!W) {
 + spin_lock_bh(msg_schedule_lock);
 + W = msg_schedule;
 + }

If this code can be called from an ISR is the kmalloc()
call safe?

If the above is safe, wouldn't it be better to:
1) try to use the static buffer
2) try to kalloc() a buffer
3) spinwait for the static buffer to be free

David



--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: sha512: make it work, undo percpu message schedule

2012-01-13 Thread Eric Dumazet
Le vendredi 13 janvier 2012 à 13:33 +0200, Alexey Dobriyan a écrit :
 On 1/13/12, Eric Dumazet eric.duma...@gmail.com wrote:
 
  +   static u64 msg_schedule[80];
  +   static DEFINE_SPINLOCK(msg_schedule_lock);
 
 No guys, no.
 
 SHA-512 only needs u64[16] running window for message scheduling.
 
 I'm sending whitespace mangled patch which is only tested
 with selfcryptotest passed, so you won't apply something complex.
 
 Stackspace usage drops down to like this:
 
 -139:   48 81 ec c8 02 00 00sub$0x2c8,%rsp
 +136:   48 81 ec 18 01 00 00sub$0x118,%rsp
 
 --- a/crypto/sha512_generic.c
 +++ b/crypto/sha512_generic.c
 @@ -21,8 +21,6 @@
  #include linux/percpu.h
  #include asm/byteorder.h
 
 -static DEFINE_PER_CPU(u64[80], msg_schedule);
 -
  static inline u64 Ch(u64 x, u64 y, u64 z)
  {
  return z ^ (x  (y ^ z));
 @@ -80,7 +78,7 @@ static inline void LOAD_OP(int I, u64 *W, const u8 *input)
 
  static inline void BLEND_OP(int I, u64 *W)
  {
 - W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
 + W[I%16] = s1(W[(I-2)%16]) + W[(I-7)%16] + s0(W[(I-15)%16]) + W[I%16];
  }
 
  static void
 @@ -89,38 +87,48 @@ sha512_transform(u64 *state, const u8 *input)
   u64 a, b, c, d, e, f, g, h, t1, t2;
 
   int i;
 - u64 *W = get_cpu_var(msg_schedule);
 + u64 W[16];
 
   /* load the input */
  for (i = 0; i  16; i++)
  LOAD_OP(i, W, input);
 
 -for (i = 16; i  80; i++) {
 -BLEND_OP(i, W);
 -}
 -
   /* load the state into our registers */
   a=state[0];   b=state[1];   c=state[2];   d=state[3];
   e=state[4];   f=state[5];   g=state[6];   h=state[7];
 
 - /* now iterate */
 - for (i=0; i80; i+=8) {
 - t1 = h + e1(e) + Ch(e,f,g) + sha512_K[i  ] + W[i  ];
 - t2 = e0(a) + Maj(a,b,c);d+=t1;h=t1+t2;
 - t1 = g + e1(d) + Ch(d,e,f) + sha512_K[i+1] + W[i+1];
 - t2 = e0(h) + Maj(h,a,b);c+=t1;g=t1+t2;
 - t1 = f + e1(c) + Ch(c,d,e) + sha512_K[i+2] + W[i+2];
 - t2 = e0(g) + Maj(g,h,a);b+=t1;f=t1+t2;
 - t1 = e + e1(b) + Ch(b,c,d) + sha512_K[i+3] + W[i+3];
 - t2 = e0(f) + Maj(f,g,h);a+=t1;e=t1+t2;
 - t1 = d + e1(a) + Ch(a,b,c) + sha512_K[i+4] + W[i+4];
 - t2 = e0(e) + Maj(e,f,g);h+=t1;d=t1+t2;
 - t1 = c + e1(h) + Ch(h,a,b) + sha512_K[i+5] + W[i+5];
 - t2 = e0(d) + Maj(d,e,f);g+=t1;c=t1+t2;
 - t1 = b + e1(g) + Ch(g,h,a) + sha512_K[i+6] + W[i+6];
 - t2 = e0(c) + Maj(c,d,e);f+=t1;b=t1+t2;
 - t1 = a + e1(f) + Ch(f,g,h) + sha512_K[i+7] + W[i+7];
 - t2 = e0(b) + Maj(b,c,d);e+=t1;a=t1+t2;
 +#define SHA512_0_15(i, a, b, c, d, e, f, g, h)   \
 + t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i] + W[i];  \
 + t2 = e0(a) + Maj(a,b,c);\
 + d += t1;\
 + h = t1 + t2
 +
 +#define SHA512_16_79(i, a, b, c, d, e, f, g, h)  \
 + BLEND_OP(i, W); \
 + t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i] + W[(i)%16]; \
 + t2 = e0(a) + Maj(a,b,c);\
 + d += t1;\
 + h = t1 + t2
 +
 + for (i = 0; i  16; i += 8) {
 + SHA512_0_15(i, a, b, c, d, e, f, g, h);
 + SHA512_0_15(i + 1, h, a, b, c, d, e, f, g);
 + SHA512_0_15(i + 2, g, h, a, b, c, d, e, f);
 + SHA512_0_15(i + 3, f, g, h, a, b, c, d, e);
 + SHA512_0_15(i + 4, e, f, g, h, a, b, c, d);
 + SHA512_0_15(i + 5, d, e, f, g, h, a, b, c);
 + SHA512_0_15(i + 6, c, d, e, f, g, h, a, b);
 + SHA512_0_15(i + 7, b, c, d, e, f, g, h, a);
 + }
 + for (i = 16; i  80; i += 8) {
 + SHA512_16_79(i, a, b, c, d, e, f, g, h);
 + SHA512_16_79(i + 1, h, a, b, c, d, e, f, g);
 + SHA512_16_79(i + 2, g, h, a, b, c, d, e, f);
 + SHA512_16_79(i + 3, f, g, h, a, b, c, d, e);
 + SHA512_16_79(i + 4, e, f, g, h, a, b, c, d);
 + SHA512_16_79(i + 5, d, e, f, g, h, a, b, c);
 + SHA512_16_79(i + 6, c, d, e, f, g, h, a, b);
 + SHA512_16_79(i + 7, b, c, d, e, f, g, h, a);
   }
 
   state[0] += a; state[1] += b; state[2] += c; state[3] += d;
 @@ -128,8 +136,6 @@ sha512_transform(u64 *state, const u8 *input)
 
   /* erase our data */
   a = b = c = d = e = f = g = h = t1 = t2 = 0;
 - memset(W, 0, sizeof(__get_cpu_var(msg_schedule)));
 - put_cpu_var(msg_schedule);
  }
 
  static int


Even if its true, its not stable material.

stable teams want obvious patches.



--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to