Hello,
1) Firstly, I was surprised about current AES implementation timings. I
wrote the simplest test, containing calls to 4 core AES functions (
AES_set_encrypt_key, AES_encrypt(), AES_set_decrypt_key(), AES_decrypt)
and measured the clocks of some OpenSSL implementations. I used the x86
processors with no AES_NI support (AES_NI is faster anyway) and I used
the smallest AES block (16 bytes) and 256-bit key.
Here are the results (all files are taken from 1.0.1c), I've used Intel
C Compiler 11 with /O2 /arch:ia32 options and gcc 4.71 with -O3:
set_encr() encr.() set_decr() decr.()
1) aes_x86core.c icl 256 448 1530 521
gcc 367 559 1660 653
2) aes_core.c icl 275 398 908 385
(calls to private_) gcc 335 504 1169 508
3) /DAES_ASM (asm icl/gcc 242 673 1394 943
from aes-586.pl)
4) aes_x86core.c icl 253 448 856 517
(if 0 changed to gcc 368 543 1430 635
if 1)
5) the same + icl 253 402 856 407
#undef gcc 368 449 1430 527
AES_COMPACT_IN_OUTER_ROUNDS
6) patch (see below) icl 275 398 732 385
gcc 335 498 1090 508
It seems that almost all AES optimizations are generating worse code on
x86 platform than an old aes_core.c (that is just Rijndael optimized
reference code). ASM and aes_x86core.c versions are worse in all
functions, except of set_encrypt_key. As for encrypt() and decrypt()
function, this behavior could be possible explained that they are
optimized for large AES blocks, but I can't explain why
set_decrypt_key() implementation is so slow...
2) So I decided to improve AES_set_decrypt_key() and here is the patch.
The result of patched code compiled with icl is 732 cycles for 256-bit
key that is 15% faster that best current implementation. For gcc, result
is worse - 1086 cycles (8% speed-up). I also wrote an asm implementation
that is faster than icl for 128-bit keys only (patched C version does
550 cycles and an asm version does 475) and mush faster than gcc code
for any key length. Unfortunately, I was not able to make a patch for
aes-586.pl because my assembler version actively uses all 7 registers
and there is no more register to address constant tables Te1..4, Td1..4
etc. I address them like
mov ebp, DWORD PTR _Td2[ebp*4]
but in your code these tables are in the code segment and require one
more register.
If you're interested, Andy, I can send you this implementation.
--
SY / C4acT/\uBo Pavel Semjanov
_ _ _ http://www.semjanov.com
| | |-| |_|_| |-|
--- aes_core.c Tue Jun 21 20:23:42 2011
+++ aes_core-patched.c Wed Aug 29 00:42:38 2012
@@ -730,47 +730,277 @@
AES_KEY *key) {
u32 *rk;
- int i, j, status;
- u32 temp;
+ int i, j;
+ u32 temp0, temp1, temp2, temp3;
+ u32 temp4, temp5, temp6, temp7;
- /* first, start with an encryption schedule */
- status = private_AES_set_encrypt_key(userKey, bits, key);
- if (status < 0)
- return status;
-
- rk = key->rd_key;
-
- /* invert the order of the round keys: */
- for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
- temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
- temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
- temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
- temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+ if (!userKey || !key)
+ return -1;
+ if (bits != 128 && bits != 192 && bits != 256)
+ return -2;
+
+ key->rounds = 10 + (bits - 128) / 32;
+ rk = key->rd_key + key->rounds * 4;
+ i = 0;
+
+ temp0 = rk[0] = GETU32(userKey );
+ temp1 = rk[1] = GETU32(userKey + 4);
+ temp2 = rk[2] = GETU32(userKey + 8);
+ temp3 = rk[3] = GETU32(userKey + 12);
+ if (bits == 128) {
+ while (1) {
+ rk -= 4;
+ temp0 =
+ (Te2[(temp3 >> 16) & 0xff] & 0xff000000) ^
+ (Te3[(temp3 >> 8) & 0xff] & 0x00ff0000) ^
+ (Te0[(temp3 ) & 0xff] & 0x0000ff00) ^
+ (Te1[(temp3 >> 24) ] & 0x000000ff) ^
+ rcon[i] ^ temp0;
+
+ if (++i == 10) goto LAST_ROUND;
+
+ rk[0] =
+ Td0[Te1[(temp0 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp0 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp0 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp0 ) & 0xff] & 0xff];
+
+ temp1 = temp1 ^ temp0;
+ rk[1] =
+ Td0[Te1[(temp1 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp1 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp1 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp1 ) & 0xff] & 0xff];
+
+ temp2 = temp2 ^ temp1;
+ rk[2] =
+ Td0[Te1[(temp2 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp2 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp2 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp2 ) & 0xff] & 0xff];
+
+ temp3 = temp3 ^ temp2;
+ rk[3] =
+ Td0[Te1[(temp3 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp3 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp3 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp3 ) & 0xff] & 0xff];
+ }
}
- /* apply the inverse MixColumn transform to all round keys but the
first and the last: */
- for (i = 1; i < (key->rounds); i++) {
- rk += 4;
- rk[0] =
- Td0[Te1[(rk[0] >> 24) ] & 0xff] ^
- Td1[Te1[(rk[0] >> 16) & 0xff] & 0xff] ^
- Td2[Te1[(rk[0] >> 8) & 0xff] & 0xff] ^
- Td3[Te1[(rk[0] ) & 0xff] & 0xff];
- rk[1] =
- Td0[Te1[(rk[1] >> 24) ] & 0xff] ^
- Td1[Te1[(rk[1] >> 16) & 0xff] & 0xff] ^
- Td2[Te1[(rk[1] >> 8) & 0xff] & 0xff] ^
- Td3[Te1[(rk[1] ) & 0xff] & 0xff];
- rk[2] =
- Td0[Te1[(rk[2] >> 24) ] & 0xff] ^
- Td1[Te1[(rk[2] >> 16) & 0xff] & 0xff] ^
- Td2[Te1[(rk[2] >> 8) & 0xff] & 0xff] ^
- Td3[Te1[(rk[2] ) & 0xff] & 0xff];
- rk[3] =
- Td0[Te1[(rk[3] >> 24) ] & 0xff] ^
- Td1[Te1[(rk[3] >> 16) & 0xff] & 0xff] ^
- Td2[Te1[(rk[3] >> 8) & 0xff] & 0xff] ^
- Td3[Te1[(rk[3] ) & 0xff] & 0xff];
+ temp4 = GETU32(userKey + 16);
+ temp5 = GETU32(userKey + 20);
+// InvMixColumn for pre-last 2 rk
+ rk[-4] =
+ Td0[Te1[(temp4 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp4 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp4 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp4 ) & 0xff] & 0xff];
+ rk[-3] =
+ Td0[Te1[(temp5 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp5 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp5 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp5 ) & 0xff] & 0xff];
+
+ if (bits == 192) {
+ while (1) { // doing 2 rounds in while
+ rk -= 12;
+ temp0 = temp0 ^
+ (Te2[(temp5 >> 16) & 0xff] & 0xff000000) ^
+ (Te3[(temp5 >> 8) & 0xff] & 0x00ff0000) ^
+ (Te0[(temp5 ) & 0xff] & 0x0000ff00) ^
+ (Te1[(temp5 >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[10] =
+ Td0[Te1[(temp0 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp0 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp0 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp0 ) & 0xff] & 0xff];
+
+ temp1 = temp1 ^ temp0;
+ rk[11] =
+ Td0[Te1[(temp1 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp1 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp1 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp1 ) & 0xff] & 0xff];
+ temp2 = temp2 ^ temp1;
+ rk[4] =
+ Td0[Te1[(temp2 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp2 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp2 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp2 ) & 0xff] & 0xff];
+
+ temp3 = temp3 ^ temp2;
+ rk[5] =
+ Td0[Te1[(temp3 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp3 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp3 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp3 ) & 0xff] & 0xff];
+ ++i;
+ temp4 = temp4 ^ temp3;
+ rk[6] =
+ Td0[Te1[(temp4 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp4 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp4 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp4 ) & 0xff] & 0xff];
+
+ temp5 = temp5 ^ temp4;
+ rk[7] =
+ Td0[Te1[(temp5 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp5 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp5 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp5 ) & 0xff] & 0xff];
+
+ temp0 = temp0 ^
+ (Te2[(temp5 >> 16) & 0xff] & 0xff000000) ^
+ (Te3[(temp5 >> 8) & 0xff] & 0x00ff0000) ^
+ (Te0[(temp5 ) & 0xff] & 0x0000ff00) ^
+ (Te1[(temp5 >> 24) ] & 0x000000ff) ^
+ rcon[i];
+
+ if (i == 7) { // last round
+ goto LAST_ROUND;
+ }
+
+ rk[0] =
+ Td0[Te1[(temp0 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp0 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp0 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp0 ) & 0xff] & 0xff];
+
+ temp1 = temp1 ^ temp0;
+ rk[1] =
+ Td0[Te1[(temp1 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp1 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp1 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp1 ) & 0xff] & 0xff];
+
+ temp2 = temp2 ^ temp1;
+ rk[2] =
+ Td0[Te1[(temp2 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp2 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp2 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp2 ) & 0xff] & 0xff];
+
+ temp3 = temp3 ^ temp2;
+ rk[3] =
+ Td0[Te1[(temp3 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp3 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp3 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp3 ) & 0xff] & 0xff];
+ ++i;
+
+ temp4 = temp4 ^ temp3;
+ rk[-4] =
+ Td0[Te1[(temp4 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp4 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp4 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp4 ) & 0xff] & 0xff];
+
+ temp5 = temp5 ^ temp4;
+ rk[-3] =
+ Td0[Te1[(temp5 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp5 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp5 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp5 ) & 0xff] & 0xff];
+
+ }
}
+ temp6 = GETU32(userKey + 24);
+ temp7 = GETU32(userKey + 28);
+// InvMix for pre-last 4 rk
+ if (bits == 256) {
+ rk[-2] =
+ Td0[Te1[(temp6 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp6 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp6 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp6 ) & 0xff] & 0xff];
+ rk[-1] =
+ Td0[Te1[(temp7 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp7 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp7 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp7 ) & 0xff] & 0xff];
+
+ while (1) {
+ rk -= 8;
+ temp0 = temp0 ^
+ (Te2[(temp7 >> 16) & 0xff] & 0xff000000) ^
+ (Te3[(temp7 >> 8) & 0xff] & 0x00ff0000) ^
+ (Te0[(temp7 ) & 0xff] & 0x0000ff00) ^
+ (Te1[(temp7 >> 24) ] & 0x000000ff) ^
+ rcon[i];
+
+ if (i == 6) { // last round
+ goto LAST_ROUND;
+ }
+
+ rk[0] =
+ Td0[Te1[(temp0 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp0 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp0 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp0 ) & 0xff] & 0xff];
+
+ temp1 = temp1 ^ temp0;
+ rk[1] =
+ Td0[Te1[(temp1 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp1 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp1 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp1 ) & 0xff] & 0xff];
+
+ temp2 = temp2 ^ temp1;
+ rk[2] =
+ Td0[Te1[(temp2 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp2 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp2 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp2 ) & 0xff] & 0xff];
+
+ temp3 = temp3 ^ temp2;
+ rk[3] =
+ Td0[Te1[(temp3 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp3 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp3 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp3 ) & 0xff] & 0xff];
+
+ i++;
+ temp4 = temp4 ^
+ (Te2[(temp3 >> 24) ] & 0xff000000) ^
+ (Te3[(temp3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te0[(temp3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te1[(temp3 ) & 0xff] & 0x000000ff);
+ rk[-4] =
+ Td0[Te1[(temp4 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp4 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp4 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp4 ) & 0xff] & 0xff];
+
+ temp5 = temp5 ^ temp4;
+ rk[-3] =
+ Td0[Te1[(temp5 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp5 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp5 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp5 ) & 0xff] & 0xff];
+
+ temp6 = temp6 ^ temp5;
+ rk[-2] =
+ Td0[Te1[(temp6 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp6 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp6 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp6 ) & 0xff] & 0xff];
+
+ temp7 = temp7 ^ temp6;
+ rk[-1] =
+ Td0[Te1[(temp7 >> 24) ] & 0xff] ^
+ Td1[Te1[(temp7 >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(temp7 >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(temp7 ) & 0xff] & 0xff];
+
+ }
+ }
+LAST_ROUND:
+ rk[0] = temp0;
+ rk[1] = temp1 ^ rk[0];
+ rk[2] = temp2 ^ rk[1];
+ rk[3] = temp3 ^ rk[2];
+
return 0;
}