On Sun, 01 Jun 2014 12:00:08 -0500 Rob Landley <[email protected]> wrote:
> On 05/15/14 11:50, Ivo van Poorten wrote:
> > cur-hg 8.6
> > daniel 6.7
> > ivo 4.9
> > md5sum 2.8
> > openssl 2.7
The attached patch averages 3.2s on the same dataset and CPU.
> Could you get me a patch on top of what's currently there?
name                 old     new   delta
-----------------------------------------------------------------------
md5_transform        243    2011    1768
md5rot                64       0     -64
md5table             256       0    -256
-----------------------------------------------------------------------
total                               1448
Here's a new version with only a swap-and-rol macro; the rest is
unrolled and inlined. It is indeed a lot bigger, but also quite a bit
faster. It is not yet as fast as GNU's md5sum or openssl, but we're
getting closer. Perhaps the memcpy() is becoming a bottleneck.
Regards,
Ivo
diff -r 9fd2bcedbeb5 toys/lsb/md5sum.c
--- a/toys/lsb/md5sum.c Tue Jun 03 06:27:24 2014 -0500
+++ b/toys/lsb/md5sum.c Sat Jun 07 15:29:17 2014 +0200
@@ -48,65 +48,95 @@
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+#define swap_and_rol(r) swap=x3; x3=x2; x2=x1; x1 += rol(temp, r); x0=swap;
+
// for(i=0; i<64; i++) md5table[i] = abs(sin(i+1))*(1<<32); But calculating
// that involves not just floating point but pulling in -lm (and arguing with
-// C about whether 1<<32 is a valid thing to do on 32 bit platforms) so:
-
-static uint32_t md5table[64] = {
- 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
- 0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
- 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
- 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
- 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
- 0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
- 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
- 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
- 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
- 0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
- 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
-};
-
-static const uint8_t md5rot[64] = {
- 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
- 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20,
- 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
- 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21
-};
+// C about whether 1<<32 is a valid thing to do on 32 bit platforms) so...
// Mix next 64 bytes of data into md5 hash
static void md5_transform(void)
{
- unsigned x[4], *b = TT.buffer.i;
- int i;
+ unsigned temp, x0, x1, x2, x3, swap, *b = TT.buffer.i;
- memcpy(x, TT.state, sizeof(x));
+ x0 = TT.state[0];
+ x1 = TT.state[1];
+ x2 = TT.state[2];
+ x3 = TT.state[3];
- for (i=0; i<64; i++) {
- unsigned int in, temp, swap;
- if (i<16) {
- in = i;
- temp = x[1];
- temp = (temp & x[2]) | ((~temp) & x[3]);
- } else if (i<32) {
- in = (1+(5*i))&15;
- temp = x[3];
- temp = (x[1] & temp) | (x[2] & ~temp);
- } else if (i<48) {
- in = (3*i+5)&15;
- temp = x[1] ^ x[2] ^ x[3];
- } else {
- in = (7*i)&15;
- temp = x[2] ^ (x[1] | ~x[3]);
- }
- temp += x[0] + b[in] + md5table[i];
- swap = x[3];
- x[3] = x[2];
- x[2] = x[1];
- x[1] += rol(temp, md5rot[i]);
- x[0] = swap;
- }
- for (i=0; i<4; i++) TT.state[i] += x[i];
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 0] + 0xd76aa478; swap_and_rol( 7);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 1] + 0xe8c7b756; swap_and_rol(12);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 2] + 0x242070db; swap_and_rol(17);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 3] + 0xc1bdceee; swap_and_rol(22);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 4] + 0xf57c0faf; swap_and_rol( 7);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 5] + 0x4787c62a; swap_and_rol(12);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 6] + 0xa8304613; swap_and_rol(17);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 7] + 0xfd469501; swap_and_rol(22);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 8] + 0x698098d8; swap_and_rol( 7);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[ 9] + 0x8b44f7af; swap_and_rol(12);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[10] + 0xffff5bb1; swap_and_rol(17);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[11] + 0x895cd7be; swap_and_rol(22);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[12] + 0x6b901122; swap_and_rol( 7);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[13] + 0xfd987193; swap_and_rol(12);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[14] + 0xa679438e; swap_and_rol(17);
+ temp = ((x1 & x2) | ((~x1) & x3)) + x0 + b[15] + 0x49b40821; swap_and_rol(22);
+
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 1] + 0xf61e2562; swap_and_rol( 5);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 6] + 0xc040b340; swap_and_rol( 9);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[11] + 0x265e5a51; swap_and_rol(14);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 0] + 0xe9b6c7aa; swap_and_rol(20);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 5] + 0xd62f105d; swap_and_rol( 5);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[10] + 0x02441453; swap_and_rol( 9);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[15] + 0xd8a1e681; swap_and_rol(14);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 4] + 0xe7d3fbc8; swap_and_rol(20);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 9] + 0x21e1cde6; swap_and_rol( 5);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[14] + 0xc33707d6; swap_and_rol( 9);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 3] + 0xf4d50d87; swap_and_rol(14);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 8] + 0x455a14ed; swap_and_rol(20);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[13] + 0xa9e3e905; swap_and_rol( 5);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 2] + 0xfcefa3f8; swap_and_rol( 9);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[ 7] + 0x676f02d9; swap_and_rol(14);
+ temp = ((x1 & x3) | (x2 & ~x3)) + x0 + b[12] + 0x8d2a4c8a; swap_and_rol(20);
+
+ temp = (x1 ^ x2 ^ x3) + x0 + b[ 5] + 0xfffa3942; swap_and_rol( 4);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[ 8] + 0x8771f681; swap_and_rol(11);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[11] + 0x6d9d6122; swap_and_rol(16);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[14] + 0xfde5380c; swap_and_rol(23);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[ 1] + 0xa4beea44; swap_and_rol( 4);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[ 4] + 0x4bdecfa9; swap_and_rol(11);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[ 7] + 0xf6bb4b60; swap_and_rol(16);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[10] + 0xbebfbc70; swap_and_rol(23);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[13] + 0x289b7ec6; swap_and_rol( 4);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[ 0] + 0xeaa127fa; swap_and_rol(11);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[ 3] + 0xd4ef3085; swap_and_rol(16);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[ 6] + 0x04881d05; swap_and_rol(23);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[ 9] + 0xd9d4d039; swap_and_rol( 4);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[12] + 0xe6db99e5; swap_and_rol(11);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[15] + 0x1fa27cf8; swap_and_rol(16);
+ temp = (x1 ^ x2 ^ x3) + x0 + b[ 2] + 0xc4ac5665; swap_and_rol(23);
+
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 0] + 0xf4292244; swap_and_rol( 6);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 7] + 0x432aff97; swap_and_rol(10);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[14] + 0xab9423a7; swap_and_rol(15);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 5] + 0xfc93a039; swap_and_rol(21);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[12] + 0x655b59c3; swap_and_rol( 6);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 3] + 0x8f0ccc92; swap_and_rol(10);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[10] + 0xffeff47d; swap_and_rol(15);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 1] + 0x85845dd1; swap_and_rol(21);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 8] + 0x6fa87e4f; swap_and_rol( 6);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[15] + 0xfe2ce6e0; swap_and_rol(10);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 6] + 0xa3014314; swap_and_rol(15);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[13] + 0x4e0811a1; swap_and_rol(21);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 4] + 0xf7537e82; swap_and_rol( 6);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[11] + 0xbd3af235; swap_and_rol(10);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 2] + 0x2ad7d2bb; swap_and_rol(15);
+ temp = (x2 ^ (x1 | ~x3)) + x0 + b[ 9] + 0xeb86d391; swap_and_rol(21);
+
+ TT.state[0] += x0;
+ TT.state[1] += x1;
+ TT.state[2] += x2;
+ TT.state[3] += x3;
}
// Mix next 64 bytes of data into sha1 hash.
_______________________________________________
Toybox mailing list
[email protected]
http://lists.landley.net/listinfo.cgi/toybox-landley.net