On Wed, 14 May 2014 21:56:13 -0700 Daniel Verkamp <[email protected]> wrote:
> Here's a quick cleanup of md5sum. Executive summary: smaller and
> faster.
Nice! It got me excited to see if I could get it faster.
Times in seconds on my machine (IA32 Sempron 1.66MHz):
cur-hg 8.6
daniel 6.7
ivo 4.9
md5sum 2.8
openssl 2.7
I removed the conditionals out of the inner loop, introduced another
LUT, made the LUTs align.
bloatcheck:
-----------------------------------------------------------------------
intab 0 256 256
md5rot 0 256 256
md5_transform 366 365 -1
-----------------------------------------------------------------------
511
The tables can be downsized to 64 bytes, but it'll make it a bit
slower, especially on architectures where non-aligned reads are slower.
The patch is relative to current hg and includes Daniel's changes.
diff -r 55de00e9daf4 toys/lsb/md5sum.c
--- a/toys/lsb/md5sum.c Tue May 13 19:45:01 2014 -0500
+++ b/toys/lsb/md5sum.c Thu May 15 18:42:58 2014 +0200
@@ -46,6 +46,8 @@
} buffer;
)
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
// for(i=0; i<64; i++) md5table[i] = abs(sin(i+1))*(1<<32); But calculating
// that involves not just floating point but pulling in -lm (and arguing with
// C about whether 1<<32 is a valid thing to do on 32 bit platforms) so:
@@ -64,52 +66,52 @@
0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
};
+static const unsigned md5rot[64] = {
+ 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
+ 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20,
+ 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
+ 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21
+};
+
+static const unsigned intab[64] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // i
+ 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, // (1+(5*i))&15
+ 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, // (3*i+5)&15
+ 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 // (7*i)&15
+};
+
+#define COMMON do { \
+ temp += x0 + b[*in++] + *md5t++; \
+ swap = x3; x3 = x2; x2 = x1; \
+ x1 += rol(temp, md5rot[i]); \
+ x0 = swap; } while(0)
+
// Mix next 64 bytes of data into md5 hash
static void md5_transform(void)
{
- unsigned x[4], *b = (unsigned *)TT.buffer.c;
- int i;
+ unsigned i, temp, x0, x1, x2, x3, swap, *b = TT.buffer.i;
+ const unsigned *in = intab, *md5t = md5table;
- memcpy(x, TT.state, sizeof(x));
+ x0 = TT.state[0];
+ x1 = TT.state[1];
+ x2 = TT.state[2];
+ x3 = TT.state[3];
- for (i=0; i<64; i++) {
- unsigned int in, a, rot, temp;
+ for (i=0; i<16; i++) { temp = (x1 & x2) | ((~x1) & x3); COMMON; }
+ for ( ; i<32; i++) { temp = (x1 & x3) | (x2 & ~x3); COMMON; }
+ for ( ; i<48; i++) { temp = x1 ^ x2 ^ x3; COMMON; }
+ for ( ; i<64; i++) { temp = x2 ^ (x1 | ~x3); COMMON; }
- a = (-i)&3;
- if (i<16) {
- in = i;
- rot = 7+(5*(i&3));
- temp = x[(a+1)&3];
- temp = (temp & x[(a+2)&3]) | ((~temp) & x[(a+3)&3]);
- } else if (i<32) {
- in = (1+(5*i))&15;
- temp = (i&3)+1;
- rot = temp*5;
- if (temp&2) rot--;
- temp = x[(a+3)&3];
- temp = (x[(a+1)&3] & temp) | (x[(a+2)&3] & ~temp);
- } else if (i<48) {
- in = (5+(3*(i&15)))&15;
- rot = i&3;
- rot = 4+(5*rot)+((rot+1)&6);
- temp = x[(a+1)&3] ^ x[(a+2)&3] ^ x[(a+3)&3];
- } else {
- in = (7*(i&15))&15;
- rot = (i&3)+1;
- rot = (5*rot)+(((rot+2)&2)>>1);
- temp = x[(a+2)&3] ^ (x[(a+1)&3] | ~x[(a+3)&3]);
- }
- temp += x[a] + b[in] + md5table[i];
- x[a] = x[(a+1)&3] + ((temp<<rot) | (temp>>(32-rot)));
- }
- for (i=0; i<4; i++) TT.state[i] += x[i];
+ TT.state[0] += x0;
+ TT.state[1] += x1;
+ TT.state[2] += x2;
+ TT.state[3] += x3;
}
// Mix next 64 bytes of data into sha1 hash.
static const unsigned rconsts[]={0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6};
-#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
static void sha1_transform(void)
{
_______________________________________________
Toybox mailing list
[email protected]
http://lists.landley.net/listinfo.cgi/toybox-landley.net