On Sun, 01 Jun 2014 12:00:08 -0500 Rob Landley <[email protected]> wrote:
> On 05/15/14 11:50, Ivo van Poorten wrote:
> > cur-hg      8.6
> > daniel      6.7
> > ivo         4.9
> > md5sum      2.8
> > openssl     2.7

The attached patch averages 3.2s on the same dataset and CPU.

> Could you get me a patch on top of what's currently there?

name                                           old       new      delta
-----------------------------------------------------------------------
 md5_transform                                 243      2011       1768
 md5rot                                         64         0        -64
 md5table                                      256         0       -256
-----------------------------------------------------------------------
                                                                   1448 total

Here's a new version with only a swap 'n rol macro; the rest is
unrolled and inlined. It is indeed a lot bigger, but also quite a bit
faster. It's not yet as fast as GNU's md5sum or openssl, but we're
getting closer. Perhaps the memcpy() is becoming a bottleneck.

Regards,
Ivo
diff -r 9fd2bcedbeb5 toys/lsb/md5sum.c
--- a/toys/lsb/md5sum.c	Tue Jun 03 06:27:24 2014 -0500
+++ b/toys/lsb/md5sum.c	Sat Jun 07 15:29:17 2014 +0200
@@ -48,65 +48,95 @@
 
 #define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
 
+#define swap_and_rol(r) swap=x3; x3=x2; x2=x1; x1 += rol(temp, r); x0=swap;
+
 // for(i=0; i<64; i++) md5table[i] = abs(sin(i+1))*(1<<32);  But calculating
 // that involves not just floating point but pulling in -lm (and arguing with
-// C about whether 1<<32 is a valid thing to do on 32 bit platforms) so:
-
-static uint32_t md5table[64] = {
-  0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
-  0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
-  0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
-  0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
-  0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
-  0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
-  0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
-  0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
-  0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
-  0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
-  0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
-};
-
-static const uint8_t md5rot[64] = {
-  7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
-  5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20,
-  4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
-  6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21
-};
+// C about whether 1<<32 is a valid thing to do on 32 bit platforms) so...
 
 // Mix next 64 bytes of data into md5 hash
 
 static void md5_transform(void)
 {
-  unsigned x[4], *b = TT.buffer.i;
-  int i;
+  unsigned temp, x0, x1, x2, x3, swap, *b = TT.buffer.i;
 
-  memcpy(x, TT.state, sizeof(x));
+  x0 = TT.state[0];
+  x1 = TT.state[1];
+  x2 = TT.state[2];
+  x3 = TT.state[3];
 
-  for (i=0; i<64; i++) {
-    unsigned int in, temp, swap;
-    if (i<16) {
-      in = i;
-      temp = x[1];
-      temp = (temp & x[2]) | ((~temp) & x[3]);
-    } else if (i<32) {
-      in = (1+(5*i))&15;
-      temp = x[3];
-      temp = (x[1] & temp) | (x[2] & ~temp);
-    } else if (i<48) {
-      in = (3*i+5)&15;
-      temp = x[1] ^ x[2] ^ x[3];
-    } else {
-      in = (7*i)&15;
-      temp = x[2] ^ (x[1] | ~x[3]);
-    }
-    temp += x[0] + b[in] + md5table[i];
-    swap = x[3];
-    x[3] = x[2];
-    x[2] = x[1];
-    x[1] += rol(temp, md5rot[i]);
-    x[0] = swap;
-  }
-  for (i=0; i<4; i++) TT.state[i] += x[i];
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 0] + 0xd76aa478; swap_and_rol( 7);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 1] + 0xe8c7b756; swap_and_rol(12);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 2] + 0x242070db; swap_and_rol(17);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 3] + 0xc1bdceee; swap_and_rol(22);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 4] + 0xf57c0faf; swap_and_rol( 7);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 5] + 0x4787c62a; swap_and_rol(12);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 6] + 0xa8304613; swap_and_rol(17);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 7] + 0xfd469501; swap_and_rol(22);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 8] + 0x698098d8; swap_and_rol( 7);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 9] + 0x8b44f7af; swap_and_rol(12);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[10] + 0xffff5bb1; swap_and_rol(17);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[11] + 0x895cd7be; swap_and_rol(22);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[12] + 0x6b901122; swap_and_rol( 7);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[13] + 0xfd987193; swap_and_rol(12);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[14] + 0xa679438e; swap_and_rol(17);
+  temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[15] + 0x49b40821; swap_and_rol(22);
+  
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 1] + 0xf61e2562; swap_and_rol( 5);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 6] + 0xc040b340; swap_and_rol( 9);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[11] + 0x265e5a51; swap_and_rol(14);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 0] + 0xe9b6c7aa; swap_and_rol(20);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 5] + 0xd62f105d; swap_and_rol( 5);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[10] + 0x02441453; swap_and_rol( 9);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[15] + 0xd8a1e681; swap_and_rol(14);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 4] + 0xe7d3fbc8; swap_and_rol(20);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 9] + 0x21e1cde6; swap_and_rol( 5);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[14] + 0xc33707d6; swap_and_rol( 9);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 3] + 0xf4d50d87; swap_and_rol(14);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 8] + 0x455a14ed; swap_and_rol(20);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[13] + 0xa9e3e905; swap_and_rol( 5);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 2] + 0xfcefa3f8; swap_and_rol( 9);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 7] + 0x676f02d9; swap_and_rol(14);
+  temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[12] + 0x8d2a4c8a; swap_and_rol(20);
+
+  temp = (x1 ^ x2 ^ x3) + x0 + b[ 5] + 0xfffa3942; swap_and_rol( 4);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[ 8] + 0x8771f681; swap_and_rol(11);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[11] + 0x6d9d6122; swap_and_rol(16);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[14] + 0xfde5380c; swap_and_rol(23);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[ 1] + 0xa4beea44; swap_and_rol( 4);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[ 4] + 0x4bdecfa9; swap_and_rol(11);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[ 7] + 0xf6bb4b60; swap_and_rol(16);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[10] + 0xbebfbc70; swap_and_rol(23);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[13] + 0x289b7ec6; swap_and_rol( 4);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[ 0] + 0xeaa127fa; swap_and_rol(11);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[ 3] + 0xd4ef3085; swap_and_rol(16);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[ 6] + 0x04881d05; swap_and_rol(23);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[ 9] + 0xd9d4d039; swap_and_rol( 4);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[12] + 0xe6db99e5; swap_and_rol(11);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[15] + 0x1fa27cf8; swap_and_rol(16);
+  temp = (x1 ^ x2 ^ x3) + x0 + b[ 2] + 0xc4ac5665; swap_and_rol(23);
+
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 0] + 0xf4292244; swap_and_rol( 6);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 7] + 0x432aff97; swap_and_rol(10);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[14] + 0xab9423a7; swap_and_rol(15);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 5] + 0xfc93a039; swap_and_rol(21);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[12] + 0x655b59c3; swap_and_rol( 6);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 3] + 0x8f0ccc92; swap_and_rol(10);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[10] + 0xffeff47d; swap_and_rol(15);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 1] + 0x85845dd1; swap_and_rol(21);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 8] + 0x6fa87e4f; swap_and_rol( 6);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[15] + 0xfe2ce6e0; swap_and_rol(10);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 6] + 0xa3014314; swap_and_rol(15);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[13] + 0x4e0811a1; swap_and_rol(21);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 4] + 0xf7537e82; swap_and_rol( 6);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[11] + 0xbd3af235; swap_and_rol(10);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 2] + 0x2ad7d2bb; swap_and_rol(15);
+  temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 9] + 0xeb86d391; swap_and_rol(21);
+
+  TT.state[0] += x0;
+  TT.state[1] += x1;
+  TT.state[2] += x2;
+  TT.state[3] += x3;
 }
 
 // Mix next 64 bytes of data into sha1 hash.
_______________________________________________
Toybox mailing list
[email protected]
http://lists.landley.net/listinfo.cgi/toybox-landley.net

Reply via email to