On Wed, 14 May 2014 21:56:13 -0700 Daniel Verkamp <[email protected]> wrote:
> Here's a quick cleanup of md5sum. Executive summary: smaller and
> faster.

Nice! It got me excited to see if I could get it faster.

Times in seconds on my machine (IA32 Sempron 1.66MHz):

cur-hg      8.6
daniel      6.7
ivo         4.9
md5sum      2.8
openssl     2.7

I removed the conditionals out of the inner loop, introduced another
LUT, made the LUTs align.

bloatcheck:

-----------------------------------------------------------------------
 intab                                           0       256        256
 md5rot                                          0       256        256
 md5_transform                                 366       365         -1
-----------------------------------------------------------------------
                                                                    511

The tables can be downsized to 64 bytes, but it'll make it a bit
slower, especially on architectures where non-aligned reads are slower.

The patch is relative to current hg and includes Daniel's changes.
diff -r 55de00e9daf4 toys/lsb/md5sum.c
--- a/toys/lsb/md5sum.c	Tue May 13 19:45:01 2014 -0500
+++ b/toys/lsb/md5sum.c	Thu May 15 18:42:58 2014 +0200
@@ -46,6 +46,8 @@
   } buffer;
 )
 
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
 // for(i=0; i<64; i++) md5table[i] = abs(sin(i+1))*(1<<32);  But calculating
 // that involves not just floating point but pulling in -lm (and arguing with
 // C about whether 1<<32 is a valid thing to do on 32 bit platforms) so:
@@ -64,52 +66,52 @@
   0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
 };
 
+static const unsigned md5rot[64] = {
+  7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
+  5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20,
+  4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
+  6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21
+};
+
+static const unsigned intab[64] = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,   // i
+  1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12,   // (1+(5*i))&15
+  5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2,   // (3*i+5)&15
+  0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9    // (7*i)&15
+};
+
+#define COMMON do { \
+                temp += x0 + b[*in++] + *md5t++; \
+                swap = x3; x3 = x2; x2 = x1; \
+                x1 += rol(temp, md5rot[i]); \
+                x0 = swap; } while(0)
+
 // Mix next 64 bytes of data into md5 hash
 
 static void md5_transform(void)
 {
-  unsigned x[4], *b = (unsigned *)TT.buffer.c;
-  int i;
+  unsigned i, temp, x0, x1, x2, x3, swap, *b = TT.buffer.i;
+  const unsigned *in = intab, *md5t = md5table;
 
-  memcpy(x, TT.state, sizeof(x));
+  x0 = TT.state[0];
+  x1 = TT.state[1];
+  x2 = TT.state[2];
+  x3 = TT.state[3];
 
-  for (i=0; i<64; i++) {
-    unsigned int in, a, rot, temp;
+  for (i=0; i<16; i++) { temp = (x1 & x2) | ((~x1) & x3); COMMON; }
+  for (   ; i<32; i++) { temp = (x1 & x3) | (x2 & ~x3);   COMMON; }
+  for (   ; i<48; i++) { temp = x1 ^ x2 ^ x3;             COMMON; }
+  for (   ; i<64; i++) { temp = x2 ^ (x1 | ~x3);          COMMON; }
 
-    a = (-i)&3;
-    if (i<16) {
-      in = i;
-      rot = 7+(5*(i&3));
-      temp = x[(a+1)&3];
-      temp = (temp & x[(a+2)&3]) | ((~temp) & x[(a+3)&3]);
-    } else if (i<32) {
-      in = (1+(5*i))&15;
-      temp = (i&3)+1;
-      rot = temp*5;
-      if (temp&2) rot--;
-      temp = x[(a+3)&3];
-      temp = (x[(a+1)&3] & temp) | (x[(a+2)&3] & ~temp);
-    } else if (i<48) {
-      in = (5+(3*(i&15)))&15;
-      rot = i&3;
-      rot = 4+(5*rot)+((rot+1)&6);
-      temp = x[(a+1)&3] ^ x[(a+2)&3] ^ x[(a+3)&3];
-    } else {
-      in = (7*(i&15))&15;
-      rot = (i&3)+1;
-      rot = (5*rot)+(((rot+2)&2)>>1);
-      temp = x[(a+2)&3] ^ (x[(a+1)&3] | ~x[(a+3)&3]);
-    }
-    temp += x[a] + b[in] + md5table[i];
-    x[a] = x[(a+1)&3] + ((temp<<rot) | (temp>>(32-rot)));
-  }
-  for (i=0; i<4; i++) TT.state[i] += x[i];
+  TT.state[0] += x0;
+  TT.state[1] += x1;
+  TT.state[2] += x2;
+  TT.state[3] += x3;
 }
 
 // Mix next 64 bytes of data into sha1 hash.
 
 static const unsigned rconsts[]={0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6};
-#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
 
 static void sha1_transform(void)
 {
_______________________________________________
Toybox mailing list
[email protected]
http://lists.landley.net/listinfo.cgi/toybox-landley.net

Reply via email to