> > As someone pointed out there're machines with sizeof(int)==8 out there. > > So I'd like to reserve some extra time for elaborating on the patch by > > redefining BF_[M0-3] macros. If you allow I can also come up with some > > alternative for #ifdef spaghetti in the beginning of > > crypto/bf/bf_locl.org. > > version from ftp://ftp.openssl.org/snapshot/openssl-SNAP-19990421.tar.gz Find attached patch relative to the mentioned snapshot. Comment about #ifdef spaghetti (that was moved from crypto/bf/bf_locl.org to crypto/opensslconf.h.in by Ulf). I don't see any need for it, so I've folded the whole mumbo-jumbo to #undef BF_PTR:-) Indeed, a good optimizing compiler should be perfectly able to deduce both BF_PTR and BF_PTR2 versions from the generic one. People with poor compilers (read gcc:-) would have to experiment in either case and would come across the comments in bf_locl.h... BTW. Why doesn't one turbocharge it (well, probably other algorithms used by Netscape more extensively should be the first target:-) by passing and receiving the data block to be en-/decrypted by value instead of by reference? I mean like this: BF_LONG_LONG BF_encrypt (BF_LONG l,BF_LONG r,BF_KEY *key) Instead of this: void BF_encrypt (BF_LONG *data,BF_KEY *key) Well, it wouldn't make a hell of a difference on Intel as arguments have to be *written* into memory (stack or array) in either case, but on RISC it could be a big hit! Or is nobody interested in anything but Intel as always? And yes, I realize it could be a pain in the ass:-) Especially receiving the result part... > > SHA might need extra consideration too then... Is coming soon. It will very likely be followed by a number of patches to other digest algorithms. The idea is to clean up this #ifdef *_ENDIAN mess, which apparently was used to mask the code that doesn't work on 64-bit little-endian platforms (a.k.a. Alpha-based). Cheers. Andy.
*** ./crypto/bf/blowfish.h.orig Tue Apr 20 18:00:10 1999 --- ./crypto/bf/blowfish.h Mon Apr 26 19:24:15 1999 *************** *** 66,73 **** #define BF_ENCRYPT 1 #define BF_DECRYPT 0 ! #ifdef WIN16 #define BF_LONG unsigned long #else #define BF_LONG unsigned int #endif --- 66,90 ---- #define BF_ENCRYPT 1 #define BF_DECRYPT 0 ! /* ! * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! * ! BF_LONG has to be at least 32 bits wide. If it's wider, then ! ! * ! BF_LONG_LOG2 has to be defined along. ! ! * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! */ ! ! #if defined(WIN16) || defined(__LP32__) #define BF_LONG unsigned long + #elif defined(_CRAY) || defined(__ILP64__) + #define BF_LONG unsigned long + #define BF_LONG_LOG2 3 + /* + * _CRAY note. I could declare short, but I have no idea what impact + * does it have on performance on none-T3E machines. I could declare + * int, but at least on C90 sizeof(int) can be chosen at compile time. + * So I've chosen long... + * <[EMAIL PROTECTED]> + */ #else #define BF_LONG unsigned int #endif *** ./crypto/bf/bf_locl.h.orig Wed Apr 21 19:30:49 1999 --- ./crypto/bf/bf_locl.h Mon Apr 26 19:21:39 1999 *************** *** 151,206 **** /* This is actually a big endian algorithm, the most significate byte * is used to lookup array 0 */ - #define BF_M 0x3fc - #define BF_0 22L - #define BF_1 14L - #define BF_2 6L - #define BF_3 2L /* left shift */ - #if defined(BF_PTR2) ! /* This is basically a special pentium verson */ ! #define BF_ENC(LL,R,S,P) \ ! { \ ! BF_LONG t,u,v; \ ! u=R>>BF_0; \ ! v=R>>BF_1; \ ! u&=BF_M; \ ! v&=BF_M; \ ! t= *(BF_LONG *)((unsigned char *)&(S[ 0])+u); \ ! u=R>>BF_2; \ ! t+= *(BF_LONG *)((unsigned char *)&(S[256])+v); \ ! v=R<<BF_3; \ ! u&=BF_M; \ ! v&=BF_M; \ ! t^= *(BF_LONG *)((unsigned char *)&(S[512])+u); \ ! LL^=P; \ ! t+= *(BF_LONG *)((unsigned char *)&(S[768])+v); \ ! LL^=t; \ ! } #elif defined(BF_PTR) ! /* This is normally very good */ ! #define BF_ENC(LL,R,S,P) \ ! 
LL^=P; \ LL^= (((*(BF_LONG *)((unsigned char *)&(S[ 0])+((R>>BF_0)&BF_M))+ \ *(BF_LONG *)((unsigned char *)&(S[256])+((R>>BF_1)&BF_M)))^ \ *(BF_LONG *)((unsigned char *)&(S[512])+((R>>BF_2)&BF_M)))+ \ ! *(BF_LONG *)((unsigned char *)&(S[768])+((R<<BF_3)&BF_M))); #else ! /* This will always work, even on 64 bit machines and strangly enough, ! * on the Alpha it is faster than the pointer versions (both 32 and 64 ! * versions of BF_LONG) */ ! #define BF_ENC(LL,R,S,P) \ ! LL^=P; \ ! LL^=((( S[ (int)(R>>24L) ] + \ ! S[0x0100+((int)(R>>16L)&0xff)])^ \ ! S[0x0200+((int)(R>> 8L)&0xff)])+ \ ! S[0x0300+((int)(R )&0xff)])&0xffffffffL; #endif #endif --- 151,219 ---- /* This is actually a big endian algorithm, the most significate byte * is used to lookup array 0 */ #if defined(BF_PTR2) ! /* ! * This is basically a special Intel version. Point is that Intel ! * doesn't have many registers, but offers a reach choice of addressing ! * modes. So we spare some registers by directly traversing BF_KEY ! * structure and hiring the most decorated addressing mode. The code ! * generated by EGCS is *perfectly* competitive with assembler ! * implementation! ! */ ! #define BF_ENC(LL,R,KEY,Pi) (\ ! LL^=KEY[Pi], \ ! t= KEY[BF_ROUNDS+2 + 0 + ((R>>24)&0xFF)], \ ! t+= KEY[BF_ROUNDS+2 + 256 + ((R>>16)&0xFF)], \ ! t^= KEY[BF_ROUNDS+2 + 512 + ((R>>8 )&0xFF)], \ ! t+= KEY[BF_ROUNDS+2 + 768 + ((R )&0xFF)], \ ! LL^=t \ ! ) #elif defined(BF_PTR) ! #ifndef BF_LONG_LOG2 ! #define BF_LONG_LOG2 2 /* default to BF_LONG being 32 bits */ ! #endif ! #define BF_M (0xFF<<BF_LONG_LOG2) ! #define BF_0 (24-BF_LONG_LOG2) ! #define BF_1 (16-BF_LONG_LOG2) ! #define BF_2 ( 8-BF_LONG_LOG2) ! #define BF_3 BF_LONG_LOG2 /* left shift */ ! /* ! * This is normally very good on RISC platforms where normally you ! * have to explicitely "multiplicate" array index by sizeof(BF_LONG) ! * in order to caclulate the effective address. This implementation ! * excuses CPU from this extra work. Power[PC] uses should have most ! 
* fun as (R>>BF_i)&BF_M gets folded into a single instruction, namely ! * rlwinm. So let'em double-check if their compiler does it. ! */ ! ! #define BF_ENC(LL,R,S,P) ( \ ! LL^=P, \ LL^= (((*(BF_LONG *)((unsigned char *)&(S[ 0])+((R>>BF_0)&BF_M))+ \ *(BF_LONG *)((unsigned char *)&(S[256])+((R>>BF_1)&BF_M)))^ \ *(BF_LONG *)((unsigned char *)&(S[512])+((R>>BF_2)&BF_M)))+ \ ! *(BF_LONG *)((unsigned char *)&(S[768])+((R<<BF_3)&BF_M))) \ ! ) #else ! /* ! * This is a *generic* version. Seem to perform best on platforms that ! * offer explicit support for extraction of 8-bit nibbles preferably ! * complemented with "multiplying" of array index by sizeof(BF_LONG). ! * For the moment of this writing the list comprises Alpha CPU featuring ! * extbl and s[48]addq instructions. ! */ ! #define BF_ENC(LL,R,S,P) ( \ ! LL^=P, \ ! LL^=((( S[ ((int)(R>>24)&0xff)] + \ ! S[0x0100+((int)(R>>16)&0xff)])^ \ ! S[0x0200+((int)(R>> 8)&0xff)])+ \ ! S[0x0300+((int)(R )&0xff)])&0xffffffffL \ ! ) #endif #endif *** ./crypto/bf/bf_enc.c.orig Tue Apr 20 00:00:16 1999 --- ./crypto/bf/bf_enc.c Mon Apr 26 18:36:43 1999 *************** *** 71,76 **** --- 71,77 ---- void BF_encrypt(BF_LONG *data, BF_KEY *key) { + #ifndef BF_PTR2 register BF_LONG l,r,*p,*s; p=key->P; *************** *** 105,110 **** --- 106,146 ---- data[1]=l&0xffffffffL; data[0]=r&0xffffffffL; + #else + register BF_LONG l,r,t,*k; + + l=data[0]; + r=data[1]; + k=(BF_LONG*)key; + + l^=k[0]; + BF_ENC(r,l,k, 1); + BF_ENC(l,r,k, 2); + BF_ENC(r,l,k, 3); + BF_ENC(l,r,k, 4); + BF_ENC(r,l,k, 5); + BF_ENC(l,r,k, 6); + BF_ENC(r,l,k, 7); + BF_ENC(l,r,k, 8); + BF_ENC(r,l,k, 9); + BF_ENC(l,r,k,10); + BF_ENC(r,l,k,11); + BF_ENC(l,r,k,12); + BF_ENC(r,l,k,13); + BF_ENC(l,r,k,14); + BF_ENC(r,l,k,15); + BF_ENC(l,r,k,16); + #if BF_ROUNDS == 20 + BF_ENC(r,l,k,17); + BF_ENC(l,r,k,18); + BF_ENC(r,l,k,19); + BF_ENC(l,r,k,20); + #endif + r^=k[BF_ROUNDS+1]; + + data[1]=l&0xffffffffL; + data[0]=r&0xffffffffL; + #endif } #ifndef BF_DEFAULT_OPTIONS *************** 
*** 111,116 **** --- 147,153 ---- void BF_decrypt(BF_LONG *data, BF_KEY *key) { + #ifndef BF_PTR2 register BF_LONG l,r,*p,*s; p=key->P; *************** *** 145,150 **** --- 182,222 ---- data[1]=l&0xffffffffL; data[0]=r&0xffffffffL; + #else + register BF_LONG l,r,t,*k; + + l=data[0]; + r=data[1]; + k=(BF_LONG *)key; + + l^=k[BF_ROUNDS+1]; + #if BF_ROUNDS == 20 + BF_ENC(r,l,k,20); + BF_ENC(l,r,k,19); + BF_ENC(r,l,k,18); + BF_ENC(l,r,k,17); + #endif + BF_ENC(r,l,k,16); + BF_ENC(l,r,k,15); + BF_ENC(r,l,k,14); + BF_ENC(l,r,k,13); + BF_ENC(r,l,k,12); + BF_ENC(l,r,k,11); + BF_ENC(r,l,k,10); + BF_ENC(l,r,k, 9); + BF_ENC(r,l,k, 8); + BF_ENC(l,r,k, 7); + BF_ENC(r,l,k, 6); + BF_ENC(l,r,k, 5); + BF_ENC(r,l,k, 4); + BF_ENC(l,r,k, 3); + BF_ENC(r,l,k, 2); + BF_ENC(l,r,k, 1); + r^=k[0]; + + data[1]=l&0xffffffffL; + data[0]=r&0xffffffffL; + #endif } void BF_cbc_encrypt(unsigned char *in, unsigned char *out, long length, *** ./crypto/opensslconf.h.in.orig Wed Apr 21 19:33:52 1999 --- ./crypto/opensslconf.h.in Mon Apr 26 19:28:29 1999 *************** *** 54,80 **** #if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H) #define CONFIG_HEADER_BF_LOCL_H ! /* Special defines which change the way the code is built depending on the ! CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find ! even newer MIPS CPU's, but at the moment one size fits all for ! optimization options. Older Sparc's work better with only UNROLL, but ! there's no way to tell at compile time what it is you're running on */ ! ! #if defined( sun ) /* Newer Sparc's */ ! # define BF_PTR ! #elif defined( __ultrix ) /* Older MIPS */ ! # define BF_PTR ! #elif defined( __sgi ) /* Newer MIPS */ ! # define BF_PTR ! #endif /* Systems-specific speed defines */ ! ! /* use BF_PTR2 for intel boxes, ! * BF_PTR for sparc and MIPS/SGI ! * use nothing for Alpha and HP. ! */ ! #if !defined(BF_PTR) && !defined(BF_PTR2) ! #define BF_PTR2 ! 
#endif #endif /* HEADER_BF_LOCL_H */ #if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H) --- 54,60 ---- #if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H) #define CONFIG_HEADER_BF_LOCL_H ! #undef BF_PTR #endif /* HEADER_BF_LOCL_H */ #if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)