Hello,

1) Firstly, I was surprised about current AES implementation timings. I wrote the simplest test, containing calls to 4 core AES functions ( AES_set_encrypt_key, AES_encrypt(), AES_set_decrypt_key(), AES_decrypt) and measured the clocks of some OpenSSL implementations. I used the x86 processors with no AES_NI support (AES_NI is faster anyway) and I used the smallest AES block (16 bytes) and 256-bit key.

Here are the results (all files are taken from 1.0.1c), I've used Intel
C Compiler 11 with /O2 /arch:ia32 options and gcc 4.71 with -O3:

                        set_encr()  encr.() set_decr() decr.()
1) aes_x86core.c     icl        256     448     1530    521
                     gcc        367     559     1660    653             

2) aes_core.c        icl        275     398     908     385     
(calls to private_)  gcc        335     504     1169    508

3) /DAES_ASM (asm    icl/gcc    242     673     1394    943
from aes-586.pl)        

4) aes_x86core.c     icl        253     448     856     517
(if 0 changed to     gcc        368     543     1430    635     
 if 1)

5) the same +        icl        253     402     856     407
#undef               gcc        368     449     1430    527
AES_COMPACT_IN_OUTER_ROUNDS

6) patch (see below) icl        275     398     732     385
                     gcc        335     498     1090    508


It seems that almost all AES optimizations are generating worse code on x86 platform than an old aes_core.c (that is just Rijndael optimized reference code). ASM and aes_x86core.c versions are worse in all functions, except of set_encrypt_key. As for encrypt() and decrypt() function, this behavior could be possible explained that they are optimized for large AES blocks, but I can't explain why set_decrypt_key() implementation is so slow...

2) So I decided to improve AES_set_decrypt_key() and here is the patch.
The result of patched code compiled with icl is 732 cycles for 256-bit key that is 15% faster that best current implementation. For gcc, result is worse - 1086 cycles (8% speed-up). I also wrote an asm implementation that is faster than icl for 128-bit keys only (patched C version does 550 cycles and an asm version does 475) and mush faster than gcc code for any key length. Unfortunately, I was not able to make a patch for aes-586.pl because my assembler version actively uses all 7 registers and there is no more register to address constant tables Te1..4, Td1..4 etc. I address them like
mov     ebp,    DWORD PTR _Td2[ebp*4]
but in your code these tables are in the code segment and require one more register.
If you're interested, Andy, I can send you this implementation.

--

   SY / C4acT/\uBo             Pavel Semjanov
   _   _         _        http://www.semjanov.com
  | | |-| |_|_| |-|
--- aes_core.c  Tue Jun 21 20:23:42 2011
+++ aes_core-patched.c  Wed Aug 29 00:42:38 2012
@@ -730,47 +730,277 @@
                         AES_KEY *key) {
 
         u32 *rk;
-       int i, j, status;
-       u32 temp;
+       int i, j;
+       u32 temp0, temp1, temp2, temp3;
+       u32 temp4, temp5, temp6, temp7;
 
-       /* first, start with an encryption schedule */
-       status = private_AES_set_encrypt_key(userKey, bits, key);
-       if (status < 0)
-               return status;
-
-       rk = key->rd_key;
-
-       /* invert the order of the round keys: */
-       for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
-               temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
-               temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
-               temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
-               temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+       if (!userKey || !key)
+               return -1;
+       if (bits != 128 && bits != 192 && bits != 256)
+               return -2;
+
+       key->rounds = 10 + (bits - 128) / 32;
+       rk = key->rd_key + key->rounds * 4;
+       i = 0;
+
+       temp0 = rk[0] = GETU32(userKey      );
+       temp1 = rk[1] = GETU32(userKey +   4);  
+       temp2 = rk[2] = GETU32(userKey +   8);
+       temp3 = rk[3] = GETU32(userKey +  12);
+       if (bits == 128) {
+               while (1) {
+                       rk -= 4;
+                       temp0 = 
+                               (Te2[(temp3 >> 16) & 0xff] & 0xff000000) ^
+                               (Te3[(temp3 >>  8) & 0xff] & 0x00ff0000) ^
+                               (Te0[(temp3      ) & 0xff] & 0x0000ff00) ^
+                               (Te1[(temp3 >> 24)       ] & 0x000000ff) ^
+                               rcon[i] ^ temp0;
+
+                       if (++i == 10)  goto LAST_ROUND;
+
+                       rk[0] =
+                       Td0[Te1[(temp0 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp0 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp0 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp0      ) & 0xff] & 0xff];
+
+                       temp1 = temp1 ^ temp0;
+                       rk[1] =
+                       Td0[Te1[(temp1 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp1 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp1 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp1      ) & 0xff] & 0xff];
+
+                       temp2 = temp2 ^ temp1;
+                       rk[2] =
+                       Td0[Te1[(temp2 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp2 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp2 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp2      ) & 0xff] & 0xff];
+
+                       temp3 = temp3 ^ temp2;
+                       rk[3] =
+                       Td0[Te1[(temp3 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp3 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp3 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp3      ) & 0xff] & 0xff];
+               }
        }
-       /* apply the inverse MixColumn transform to all round keys but the 
first and the last: */
-       for (i = 1; i < (key->rounds); i++) {
-               rk += 4;
-               rk[0] =
-                       Td0[Te1[(rk[0] >> 24)       ] & 0xff] ^
-                       Td1[Te1[(rk[0] >> 16) & 0xff] & 0xff] ^
-                       Td2[Te1[(rk[0] >>  8) & 0xff] & 0xff] ^
-                       Td3[Te1[(rk[0]      ) & 0xff] & 0xff];
-               rk[1] =
-                       Td0[Te1[(rk[1] >> 24)       ] & 0xff] ^
-                       Td1[Te1[(rk[1] >> 16) & 0xff] & 0xff] ^
-                       Td2[Te1[(rk[1] >>  8) & 0xff] & 0xff] ^
-                       Td3[Te1[(rk[1]      ) & 0xff] & 0xff];
-               rk[2] =
-                       Td0[Te1[(rk[2] >> 24)       ] & 0xff] ^
-                       Td1[Te1[(rk[2] >> 16) & 0xff] & 0xff] ^
-                       Td2[Te1[(rk[2] >>  8) & 0xff] & 0xff] ^
-                       Td3[Te1[(rk[2]      ) & 0xff] & 0xff];
-               rk[3] =
-                       Td0[Te1[(rk[3] >> 24)       ] & 0xff] ^
-                       Td1[Te1[(rk[3] >> 16) & 0xff] & 0xff] ^
-                       Td2[Te1[(rk[3] >>  8) & 0xff] & 0xff] ^
-                       Td3[Te1[(rk[3]      ) & 0xff] & 0xff];
+       temp4 = GETU32(userKey + 16);
+       temp5 = GETU32(userKey + 20);
+// InvMixColumn for pre-last 2 rk
+       rk[-4] =
+                       Td0[Te1[(temp4 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp4 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp4 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp4      ) & 0xff] & 0xff];
+       rk[-3] =
+                       Td0[Te1[(temp5 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp5 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp5 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp5      ) & 0xff] & 0xff];
+
+       if (bits == 192) {
+               while (1) {                     // doing 2 rounds in while
+                       rk -= 12;
+                       temp0 = temp0 ^
+                               (Te2[(temp5 >> 16) & 0xff] & 0xff000000) ^
+                               (Te3[(temp5 >>  8) & 0xff] & 0x00ff0000) ^
+                               (Te0[(temp5      ) & 0xff] & 0x0000ff00) ^
+                               (Te1[(temp5 >> 24)       ] & 0x000000ff) ^
+                               rcon[i];
+                       rk[10] =
+                       Td0[Te1[(temp0 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp0 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp0 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp0      ) & 0xff] & 0xff];
+
+                       temp1 = temp1 ^ temp0;
+                       rk[11] =
+                       Td0[Te1[(temp1 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp1 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp1 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp1      ) & 0xff] & 0xff];
+                       temp2 = temp2 ^ temp1;
+                       rk[4] =
+                       Td0[Te1[(temp2 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp2 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp2 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp2      ) & 0xff] & 0xff];
+
+                       temp3 = temp3 ^ temp2;
+                       rk[5] =
+                       Td0[Te1[(temp3 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp3 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp3 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp3      ) & 0xff] & 0xff];
+                       ++i;
+                       temp4 = temp4 ^ temp3;
+                       rk[6] =
+                       Td0[Te1[(temp4 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp4 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp4 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp4      ) & 0xff] & 0xff];
+
+                       temp5 = temp5 ^ temp4;
+                       rk[7] =
+                       Td0[Te1[(temp5 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp5 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp5 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp5      ) & 0xff] & 0xff];
+
+                       temp0 = temp0 ^
+                               (Te2[(temp5 >> 16) & 0xff] & 0xff000000) ^
+                               (Te3[(temp5 >>  8) & 0xff] & 0x00ff0000) ^
+                               (Te0[(temp5      ) & 0xff] & 0x0000ff00) ^
+                               (Te1[(temp5 >> 24)       ] & 0x000000ff) ^
+                               rcon[i];
+
+                       if (i == 7) { // last round
+                               goto LAST_ROUND;
+                       }
+
+                       rk[0] =
+                       Td0[Te1[(temp0 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp0 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp0 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp0      ) & 0xff] & 0xff];
+
+                       temp1 = temp1 ^ temp0;
+                       rk[1] =
+                       Td0[Te1[(temp1 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp1 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp1 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp1      ) & 0xff] & 0xff];
+
+                       temp2 = temp2 ^ temp1;
+                       rk[2] =
+                       Td0[Te1[(temp2 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp2 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp2 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp2      ) & 0xff] & 0xff];
+
+                       temp3 = temp3 ^ temp2;
+                       rk[3] =
+                       Td0[Te1[(temp3 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp3 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp3 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp3      ) & 0xff] & 0xff];
+                       ++i;
+
+                       temp4 = temp4 ^ temp3;
+                       rk[-4] =
+                       Td0[Te1[(temp4 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp4 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp4 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp4      ) & 0xff] & 0xff];
+
+                       temp5 = temp5 ^ temp4;
+                       rk[-3] =
+                       Td0[Te1[(temp5 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp5 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp5 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp5      ) & 0xff] & 0xff];
+                       
+               }
        }
+       temp6 = GETU32(userKey + 24);
+       temp7 = GETU32(userKey + 28);
+// InvMix for pre-last 4 rk
+       if (bits == 256) {
+       rk[-2] =
+                       Td0[Te1[(temp6 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp6 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp6 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp6      ) & 0xff] & 0xff];
+       rk[-1] =
+                       Td0[Te1[(temp7 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp7 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp7 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp7      ) & 0xff] & 0xff];
+
+               while (1) {
+                       rk -= 8;
+                       temp0 = temp0 ^
+                               (Te2[(temp7 >> 16) & 0xff] & 0xff000000) ^
+                               (Te3[(temp7 >>  8) & 0xff] & 0x00ff0000) ^
+                               (Te0[(temp7      ) & 0xff] & 0x0000ff00) ^
+                               (Te1[(temp7 >> 24)       ] & 0x000000ff) ^
+                               rcon[i];
+
+                       if (i == 6) {   // last round
+                               goto LAST_ROUND;
+                       }
+
+                       rk[0] =
+                       Td0[Te1[(temp0 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp0 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp0 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp0      ) & 0xff] & 0xff];
+
+                       temp1 = temp1 ^ temp0;
+                       rk[1] =
+                       Td0[Te1[(temp1 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp1 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp1 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp1      ) & 0xff] & 0xff];
+
+                       temp2 = temp2 ^ temp1;
+                       rk[2] =
+                       Td0[Te1[(temp2 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp2 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp2 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp2      ) & 0xff] & 0xff];
+
+                       temp3 = temp3 ^ temp2;
+                       rk[3] =
+                       Td0[Te1[(temp3 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp3 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp3 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp3      ) & 0xff] & 0xff];
+                                       
+                       i++;
+                       temp4 = temp4 ^
+                               (Te2[(temp3 >> 24)       ] & 0xff000000) ^
+                               (Te3[(temp3 >> 16) & 0xff] & 0x00ff0000) ^
+                               (Te0[(temp3 >>  8) & 0xff] & 0x0000ff00) ^
+                               (Te1[(temp3      ) & 0xff] & 0x000000ff);
+                       rk[-4] =
+                       Td0[Te1[(temp4 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp4 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp4 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp4      ) & 0xff] & 0xff];
+
+                       temp5 = temp5 ^ temp4;
+                       rk[-3] =
+                       Td0[Te1[(temp5 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp5 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp5 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp5      ) & 0xff] & 0xff];
+
+                       temp6 = temp6 ^ temp5;
+                       rk[-2] =
+                       Td0[Te1[(temp6 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp6 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp6 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp6      ) & 0xff] & 0xff];
+
+                       temp7 = temp7 ^ temp6;
+                       rk[-1] =
+                       Td0[Te1[(temp7 >> 24)       ] & 0xff] ^
+                       Td1[Te1[(temp7 >> 16) & 0xff] & 0xff] ^
+                       Td2[Te1[(temp7 >>  8) & 0xff] & 0xff] ^
+                       Td3[Te1[(temp7      ) & 0xff] & 0xff];
+
+               }
+       }
+LAST_ROUND:
+       rk[0] = temp0;
+       rk[1] = temp1 ^ rk[0];
+       rk[2] = temp2 ^ rk[1];
+       rk[3] = temp3 ^ rk[2];
+
        return 0;
 }
 

Reply via email to