Hi, I have made a patch against 5.7 that improves the speed of xor for amd64 by 1% (timed on a separate userland program). I tested the userland program against an i386 and an amd64 host; I didn't have access to any other architectures.
If a hardcore developer thinks this is worth it ... feel free to include something similar to my patch. The modes this affects are the CTR and the XTS AES modes, the latter being tested by me on my amd64 host with an encrypted sparse file: sd1 at scsibus3 targ 1 lun 0: <OPENBSD, SR CRYPTO, 005> SCSI2 0/direct fixed sd1: 1023MB, 512 bytes/sector, 2096561 sectors It worked so the function must be working. I have attached my patch for review inline. It goes against /sys/crypto/xform.c -peter --- xform.c.orig Mon Jun 8 09:29:27 2015 +++ xform.c Mon Jun 8 09:34:14 2015 @@ -106,6 +106,8 @@ u_int32_t deflate_decompress(u_int8_t *, u_int32_t, u_int8_t **); u_int32_t lzs_dummy(u_int8_t *, u_int32_t, u_int8_t **); +void xorfunc(u_int8_t *, u_int8_t *, int); + #define AESCTR_NONCESIZE 4 #define AESCTR_IVSIZE 8 #define AESCTR_BLOCKSIZE 16 @@ -499,8 +501,11 @@ if (++ctx->ac_block[i]) /* continue on overflow */ break; rijndaelEncrypt(ctx->ac_ek, ctx->ac_nr, ctx->ac_block, keystream); +#if 0 for (i = 0; i < AESCTR_BLOCKSIZE; i++) data[i] ^= keystream[i]; +#endif + xorfunc(data, keystream, AESCTR_BLOCKSIZE); explicit_bzero(keystream, sizeof(keystream)); } @@ -557,8 +562,11 @@ else rijndael_decrypt(&ctx->key1, block, data); +#if 0 for (i = 0; i < AES_XTS_BLOCKSIZE; i++) data[i] ^= ctx->tweak[i]; +#endif + xorfunc(data, ctx->tweak, AES_XTS_BLOCKSIZE); /* Exponentiate tweak */ carry_in = 0; @@ -676,4 +684,27 @@ { *out = NULL; return (0); +} + +void +xorfunc(u_int8_t *output, u_int8_t *input, int len) +{ + int i; +#if __amd64__ + u_int8_t *i0, *i1, *i2, *i3; + u_int8_t *o0, *o1, *o2, *o3; + + for (i = 0; i < len; i += 4) { + i0 = (u_int8_t *)&input[0 + i]; i1=(u_int8_t *)&input[1 + i]; + i2 = (u_int8_t *)&input[2 + i]; i3=(u_int8_t *)&input[3 + i]; + o0 = (u_int8_t *)&output[0 + i]; o1=(u_int8_t *)&output[1 + i]; + o2 = (u_int8_t *)&output[2 + i]; o3=(u_int8_t *)&output[3 + i]; + + *o0 ^= *i0; *o1 ^= *i1; *o2 ^= *i2; *o3 ^= *i3; + } +#else + for (i = 0; i < len; i++) { + output[i] ^= 
input[i]; + } +#endif }