> > As someone pointed out there're machines with sizeof(int)==8 out there. > > So I'd like to reserve some extra time for elaborating on the patch by > > redefining BF_[M0-3] macros. If you allow I can also come up with some > > alternative for #ifdef spaghetti in the beginning of > > crypto/bf/bf_locl.org. > > version from ftp://ftp.openssl.org/snapshot/openssl-SNAP-19990421.tar.gz Find attached patch relative to the mentioned snapshot. Comment about #ifdef spaghetti (that was moved from crypto/bf/bf_locl.org to crypto/opensslconf.h.in by Ulf). I don't see any need for it, so I've folded the whole mumbo-jumbo to #undef BF_PTR:-) Indeed, a good optimizing compiler should be perfectly able to deduce both BF_PTR and BF_PTR2 versions from the generic one. People with poor compilers (read gcc:-) would have to experiment in either case and would come across the comments in bf_locl.h... BTW. Why doesn't one turbocharge it (well, probably other algorithms used by Netscape more extensively should be the first target:-) by passing and receiving the data block to be en-/decrypted by value instead of by reference? I mean like this: BF_LONG_LONG BF_encrypt (BF_LONG l,BF_LONG r,BF_KEY *key) Instead of this: void BF_encrypt (BF_LONG *data,BF_KEY *key) Well, it wouldn't make a hell of a difference on Intel as arguments have to be *written* into memory (stack or array) in either case, but on RISC it could be a big hit! Or is nobody interested in anything but Intel as always? And yes, I realize it could be a pain in the ass:-) Especially receiving the result part... > > SHA might need extra consideration too then... Is coming soon. It will very likely be followed by a number of patches to other digest algorithms. The idea is to clean up this #ifdef *_ENDIAN mess, which apparently was used to mask the code that doesn't work on 64-bit little-endian platforms (a.k.a. Alpha-based). Cheers. Andy.
*** ./crypto/bf/blowfish.h.orig Tue Apr 20 18:00:10 1999 --- ./crypto/bf/blowfish.h Mon Apr 26 19:24:15 1999 *************** *** 66,73 **** #define BF_ENCRYPT 1 #define BF_DECRYPT 0 ! #ifdef WIN16 #define BF_LONG unsigned long #else #define BF_LONG unsigned int #endif --- 66,90 ---- #define BF_ENCRYPT 1 #define BF_DECRYPT 0 ! /* ! * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! * ! BF_LONG has to be at least 32 bits wide. If it's wider, then ! ! * ! BF_LONG_LOG2 has to be defined along. ! ! * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! */ ! ! #if defined(WIN16) || defined(__LP32__) #define BF_LONG unsigned long + #elif defined(_CRAY) || defined(__ILP64__) + #define BF_LONG unsigned long + #define BF_LONG_LOG2 3 + /* + * _CRAY note. I could declare short, but I have no idea what impact + * does it have on performance on none-T3E machines. I could declare + * int, but at least on C90 sizeof(int) can be chosen at compile time. + * So I've chosen long... + * <[EMAIL PROTECTED]> + */ #else #define BF_LONG unsigned int #endif *** ./crypto/bf/bf_locl.h.orig Wed Apr 21 19:30:49 1999 --- ./crypto/bf/bf_locl.h Mon Apr 26 19:21:39 1999 *************** *** 151,206 **** /* This is actually a big endian algorithm, the most significate byte * is used to lookup array 0 */ - #define BF_M 0x3fc - #define BF_0 22L - #define BF_1 14L - #define BF_2 6L - #define BF_3 2L /* left shift */ - #if defined(BF_PTR2) ! /* This is basically a special pentium verson */ ! #define BF_ENC(LL,R,S,P) \ ! { \ ! BF_LONG t,u,v; \ ! u=R>>BF_0; \ ! v=R>>BF_1; \ ! u&=BF_M; \ ! v&=BF_M; \ ! t= *(BF_LONG *)((unsigned char *)&(S[ 0])+u); \ ! u=R>>BF_2; \ ! t+= *(BF_LONG *)((unsigned char *)&(S[256])+v); \ ! v=R<<BF_3; \ ! u&=BF_M; \ ! v&=BF_M; \ ! t^= *(BF_LONG *)((unsigned char *)&(S[512])+u); \ ! LL^=P; \ ! t+= *(BF_LONG *)((unsigned char *)&(S[768])+v); \ ! LL^=t; \ ! } #elif defined(BF_PTR) ! /* This is normally very good */ ! #define BF_ENC(LL,R,S,P) \ ! 
LL^=P; \ LL^= (((*(BF_LONG *)((unsigned char *)&(S[ 0])+((R>>BF_0)&BF_M))+ \ *(BF_LONG *)((unsigned char *)&(S[256])+((R>>BF_1)&BF_M)))^ \ *(BF_LONG *)((unsigned char *)&(S[512])+((R>>BF_2)&BF_M)))+ \ ! *(BF_LONG *)((unsigned char *)&(S[768])+((R<<BF_3)&BF_M))); #else ! /* This will always work, even on 64 bit machines and strangly enough, ! * on the Alpha it is faster than the pointer versions (both 32 and 64 ! * versions of BF_LONG) */ ! #define BF_ENC(LL,R,S,P) \ ! LL^=P; \ ! LL^=((( S[ (int)(R>>24L) ] + \ ! S[0x0100+((int)(R>>16L)&0xff)])^ \ ! S[0x0200+((int)(R>> 8L)&0xff)])+ \ ! S[0x0300+((int)(R )&0xff)])&0xffffffffL; #endif #endif --- 151,219 ---- /* This is actually a big endian algorithm, the most significate byte * is used to lookup array 0 */ #if defined(BF_PTR2) ! /* ! * This is basically a special Intel version. Point is that Intel ! * doesn't have many registers, but offers a reach choice of addressing ! * modes. So we spare some registers by directly traversing BF_KEY ! * structure and hiring the most decorated addressing mode. The code ! * generated by EGCS is *perfectly* competitive with assembler ! * implementation! ! */ ! #define BF_ENC(LL,R,KEY,Pi) (\ ! LL^=KEY[Pi], \ ! t= KEY[BF_ROUNDS+2 + 0 + ((R>>24)&0xFF)], \ ! t+= KEY[BF_ROUNDS+2 + 256 + ((R>>16)&0xFF)], \ ! t^= KEY[BF_ROUNDS+2 + 512 + ((R>>8 )&0xFF)], \ ! t+= KEY[BF_ROUNDS+2 + 768 + ((R )&0xFF)], \ ! LL^=t \ ! ) #elif defined(BF_PTR) ! #ifndef BF_LONG_LOG2 ! #define BF_LONG_LOG2 2 /* default to BF_LONG being 32 bits */ ! #endif ! #define BF_M (0xFF<<BF_LONG_LOG2) ! #define BF_0 (24-BF_LONG_LOG2) ! #define BF_1 (16-BF_LONG_LOG2) ! #define BF_2 ( 8-BF_LONG_LOG2) ! #define BF_3 BF_LONG_LOG2 /* left shift */ ! /* ! * This is normally very good on RISC platforms where normally you ! * have to explicitely "multiplicate" array index by sizeof(BF_LONG) ! * in order to caclulate the effective address. This implementation ! * excuses CPU from this extra work. Power[PC] uses should have most ! 
* fun as (R>>BF_i)&BF_M gets folded into a single instruction, namely ! * rlwinm. So let'em double-check if their compiler does it. ! */ ! ! #define BF_ENC(LL,R,S,P) ( \ ! LL^=P, \ LL^= (((*(BF_LONG *)((unsigned char *)&(S[ 0])+((R>>BF_0)&BF_M))+ \ *(BF_LONG *)((unsigned char *)&(S[256])+((R>>BF_1)&BF_M)))^ \ *(BF_LONG *)((unsigned char *)&(S[512])+((R>>BF_2)&BF_M)))+ \ ! *(BF_LONG *)((unsigned char *)&(S[768])+((R<<BF_3)&BF_M))) \ ! ) #else ! /* ! * This is a *generic* version. Seem to perform best on platforms that ! * offer explicit support for extraction of 8-bit nibbles preferably ! * complemented with "multiplying" of array index by sizeof(BF_LONG). ! * For the moment of this writing the list comprises Alpha CPU featuring ! * extbl and s[48]addq instructions. ! */ ! #define BF_ENC(LL,R,S,P) ( \ ! LL^=P, \ ! LL^=((( S[ ((int)(R>>24)&0xff)] + \ ! S[0x0100+((int)(R>>16)&0xff)])^ \ ! S[0x0200+((int)(R>> 8)&0xff)])+ \ ! S[0x0300+((int)(R )&0xff)])&0xffffffffL \ ! ) #endif #endif *** ./crypto/bf/bf_enc.c.orig Tue Apr 20 00:00:16 1999 --- ./crypto/bf/bf_enc.c Mon Apr 26 18:36:43 1999 *************** *** 71,76 **** --- 71,77 ---- void BF_encrypt(BF_LONG *data, BF_KEY *key) { + #ifndef BF_PTR2 register BF_LONG l,r,*p,*s; p=key->P; *************** *** 105,110 **** --- 106,146 ---- data[1]=l&0xffffffffL; data[0]=r&0xffffffffL; + #else + register BF_LONG l,r,t,*k; + + l=data[0]; + r=data[1]; + k=(BF_LONG*)key; + + l^=k[0]; + BF_ENC(r,l,k, 1); + BF_ENC(l,r,k, 2); + BF_ENC(r,l,k, 3); + BF_ENC(l,r,k, 4); + BF_ENC(r,l,k, 5); + BF_ENC(l,r,k, 6); + BF_ENC(r,l,k, 7); + BF_ENC(l,r,k, 8); + BF_ENC(r,l,k, 9); + BF_ENC(l,r,k,10); + BF_ENC(r,l,k,11); + BF_ENC(l,r,k,12); + BF_ENC(r,l,k,13); + BF_ENC(l,r,k,14); + BF_ENC(r,l,k,15); + BF_ENC(l,r,k,16); + #if BF_ROUNDS == 20 + BF_ENC(r,l,k,17); + BF_ENC(l,r,k,18); + BF_ENC(r,l,k,19); + BF_ENC(l,r,k,20); + #endif + r^=k[BF_ROUNDS+1]; + + data[1]=l&0xffffffffL; + data[0]=r&0xffffffffL; + #endif } #ifndef BF_DEFAULT_OPTIONS *************** 
*** 111,116 **** --- 147,153 ---- void BF_decrypt(BF_LONG *data, BF_KEY *key) { + #ifndef BF_PTR2 register BF_LONG l,r,*p,*s; p=key->P; *************** *** 145,150 **** --- 182,222 ---- data[1]=l&0xffffffffL; data[0]=r&0xffffffffL; + #else + register BF_LONG l,r,t,*k; + + l=data[0]; + r=data[1]; + k=(BF_LONG *)key; + + l^=k[BF_ROUNDS+1]; + #if BF_ROUNDS == 20 + BF_ENC(r,l,k,20); + BF_ENC(l,r,k,19); + BF_ENC(r,l,k,18); + BF_ENC(l,r,k,17); + #endif + BF_ENC(r,l,k,16); + BF_ENC(l,r,k,15); + BF_ENC(r,l,k,14); + BF_ENC(l,r,k,13); + BF_ENC(r,l,k,12); + BF_ENC(l,r,k,11); + BF_ENC(r,l,k,10); + BF_ENC(l,r,k, 9); + BF_ENC(r,l,k, 8); + BF_ENC(l,r,k, 7); + BF_ENC(r,l,k, 6); + BF_ENC(l,r,k, 5); + BF_ENC(r,l,k, 4); + BF_ENC(l,r,k, 3); + BF_ENC(r,l,k, 2); + BF_ENC(l,r,k, 1); + r^=k[0]; + + data[1]=l&0xffffffffL; + data[0]=r&0xffffffffL; + #endif } void BF_cbc_encrypt(unsigned char *in, unsigned char *out, long length, *** ./crypto/opensslconf.h.in.orig Wed Apr 21 19:33:52 1999 --- ./crypto/opensslconf.h.in Mon Apr 26 19:28:29 1999 *************** *** 54,80 **** #if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H) #define CONFIG_HEADER_BF_LOCL_H ! /* Special defines which change the way the code is built depending on the ! CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find ! even newer MIPS CPU's, but at the moment one size fits all for ! optimization options. Older Sparc's work better with only UNROLL, but ! there's no way to tell at compile time what it is you're running on */ ! ! #if defined( sun ) /* Newer Sparc's */ ! # define BF_PTR ! #elif defined( __ultrix ) /* Older MIPS */ ! # define BF_PTR ! #elif defined( __sgi ) /* Newer MIPS */ ! # define BF_PTR ! #endif /* Systems-specific speed defines */ ! ! /* use BF_PTR2 for intel boxes, ! * BF_PTR for sparc and MIPS/SGI ! * use nothing for Alpha and HP. ! */ ! #if !defined(BF_PTR) && !defined(BF_PTR2) ! #define BF_PTR2 ! 
#endif #endif /* HEADER_BF_LOCL_H */ #if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H) --- 54,60 ---- #if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H) #define CONFIG_HEADER_BF_LOCL_H ! #undef BF_PTR #endif /* HEADER_BF_LOCL_H */ #if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)