Re: Mozilla SHA1 implementation

2005-04-22 Thread Paul Mackerras
Linus Torvalds writes:

 I've just integrated the Mozilla SHA1 library implementation that Adgar
 Toernig sent me into the standard git archive (but I did the integration
 differently).

Here is a new PPC SHA1 patch that integrates better with this...

 Interestingly, the Mozilla SHA1 code is about twice as fast as the openssl
 code on my G5, and judging by the disassembly, it's because it's much
 simpler. I think the openssl people have unrolled all the loops totally,
 which tends to be a disaster on any half-way modern CPU. But hey, it could
 be something as simple as optimization flags too.

Very interesting.  On my G4 powerbook (since I am at LCA), for a
fsck-cache on a linux-2.6 tree, it takes 6.6 seconds with the openssl
SHA1, 10.7 seconds with the Mozilla SHA1, and ~5.8 seconds with my
SHA1.  I'll test it on a G5 tonight, hopefully.

Paul.

diff -urN git.orig/Makefile git/Makefile
--- git.orig/Makefile   2005-04-22 16:23:44.0 +1000
+++ git/Makefile2005-04-22 16:43:31.0 +1000
@@ -34,9 +34,14 @@
   SHA1_HEADER=mozilla-sha1/sha1.h
   LIB_OBJS += mozilla-sha1/sha1.o
 else
+ifdef PPC_SHA1
+  SHA1_HEADER=ppc/sha1.h
+  LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
+else
   SHA1_HEADER=openssl/sha.h
   LIBS += -lssl
 endif
+endif
 
 CFLAGS += '-DSHA1_HEADER=$(SHA1_HEADER)'
 
@@ -77,7 +82,7 @@
 write-tree.o: $(LIB_H)
 
 clean:
-   rm -f *.o mozilla-sha1/*.o $(PROG) $(LIB_FILE)
+   rm -f *.o mozilla-sha1/*.o ppc/*.o $(PROG) $(LIB_FILE)
 
 backup: clean
cd .. ; tar czvf dircache.tar.gz dir-cache
diff -urN git.orig/ppc/sha1.c git/ppc/sha1.c
--- /dev/null   2005-04-04 12:56:19.0 +1000
+++ git/ppc/sha1.c  2005-04-22 16:29:19.0 +1000
@@ -0,0 +1,72 @@
+/*
+ * SHA-1 implementation.
+ *
+ * Copyright (C) 2005 Paul Mackerras [EMAIL PROTECTED]
+ *
+ * This version assumes we are running on a big-endian machine.
+ * It calls an external sha1_core() to process blocks of 64 bytes.
+ */
+#include stdio.h
+#include string.h
+#include sha1.h
+
+extern void sha1_core(uint32_t *hash, const unsigned char *p,
+ unsigned int nblocks);
+
+int SHA1_Init(SHA_CTX *c)
+{
+   c-hash[0] = 0x67452301;
+   c-hash[1] = 0xEFCDAB89;
+   c-hash[2] = 0x98BADCFE;
+   c-hash[3] = 0x10325476;
+   c-hash[4] = 0xC3D2E1F0;
+   c-len = 0;
+   c-cnt = 0;
+   return 0;
+}
+
+int SHA1_Update(SHA_CTX *c, const void *ptr, unsigned long n)
+{
+   unsigned long nb;
+   const unsigned char *p = ptr;
+
+   c-len += n  3;
+   while (n != 0) {
+   if (c-cnt || n  64) {
+   nb = 64 - c-cnt;
+   if (nb  n)
+   nb = n;
+   memcpy(c-buf.b[c-cnt], p, nb);
+   if ((c-cnt += nb) == 64) {
+   sha1_core(c-hash, c-buf.b, 1);
+   c-cnt = 0;
+   }
+   } else {
+   nb = n  6;
+   sha1_core(c-hash, p, nb);
+   nb = 6;
+   }
+   n -= nb;
+   p += nb;
+   }
+   return 0;
+}  
+
+int SHA1_Final(unsigned char *hash, SHA_CTX *c)
+{
+   unsigned int cnt = c-cnt;
+
+   c-buf.b[cnt++] = 0x80;
+   if (cnt  56) {
+   if (cnt  64)
+   memset(c-buf.b[cnt], 0, 64 - cnt);
+   sha1_core(c-hash, c-buf.b, 1);
+   cnt = 0;
+   }
+   if (cnt  56)
+   memset(c-buf.b[cnt], 0, 56 - cnt);
+   c-buf.l[7] = c-len;
+   sha1_core(c-hash, c-buf.b, 1);
+   memcpy(hash, c-hash, 20);
+   return 0;
+}
diff -urN git.orig/ppc/sha1.h git/ppc/sha1.h
--- /dev/null   2005-04-04 12:56:19.0 +1000
+++ git/ppc/sha1.h  2005-04-22 16:45:28.0 +1000
@@ -0,0 +1,20 @@
+/*
+ * SHA-1 implementation.
+ *
+ * Copyright (C) 2005 Paul Mackerras [EMAIL PROTECTED]
+ */
+#include stdint.h
+
+typedef struct sha_context {
+   uint32_t hash[5];
+   uint32_t cnt;
+   uint64_t len;
+   union {
+   unsigned char b[64];
+   uint64_t l[8];
+   } buf;
+} SHA_CTX;
+
+int SHA1_Init(SHA_CTX *c);
+int SHA1_Update(SHA_CTX *c, const void *p, unsigned long n);
+int SHA1_Final(unsigned char *hash, SHA_CTX *c);
diff -urN git.orig/ppc/sha1ppc.S git/ppc/sha1ppc.S
--- /dev/null   2005-04-04 12:56:19.0 +1000
+++ git/ppc/sha1ppc.S   2005-04-22 16:29:19.0 +1000
@@ -0,0 +1,185 @@
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras.
+ */
+#define FS 80
+
+/*
+ * We roll the registers for T, A, B, C, D, E around on each
+ * iteration; T on iteration t is A on iteration t+1, and so on.
+ * We use registers 7 - 12 for this.
+ */
+#define RT(t)  t)+5)%6)+7)
+#define RA(t)  t)+4)%6)+7)
+#define RB(t)  t)+3)%6)+7)
+#define RC(t)  t)+2)%6)+7)
+#define RD(t)  t)+1)%6)+7)
+#define RE(t)  

Re: Mozilla SHA1 implementation

2005-04-22 Thread Paul Mackerras
Linus Torvalds writes:

 Interestingly, the Mozilla SHA1 code is about twice as fast as the openssl
 code on my G5, and judging by the disassembly, it's because it's much
 simpler. I think the openssl people have unrolled all the loops totally,
 which tends to be a disaster on any half-way modern CPU. But hey, it could
 be something as simple as optimization flags too.

Which gcc version are you using?

I get the opposite result on my 2GHz G5: the Mozilla version does
45MB/s, the openssl version does 135MB/s, and my version does 218MB/s.
The time for a fsck-cache on a linux-2.6 tree (cache hot) is 8.0
seconds for the Mozilla version, 5.2 seconds for the openssl version,
and 4.4 seconds for my version.

Paul.
-
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html