Hi,

I have made a patch against 5.7 that improves the speed of xor for amd64
by 1% (timed on a separate userland program).   I tested the userland
program against an i386 and an amd64 host, didn't have access to any other
architectures.  

If a hardcore developer thinks this is worth it ... feel free to include
something similar to my patch.  The modes this affects are the CTR and the 
XTS AES modes, the latter being tested by me on my amd64 host with an encrypted
sparse file:

sd1 at scsibus3 targ 1 lun 0: <OPENBSD, SR CRYPTO, 005> SCSI2 0/direct fixed
sd1: 1023MB, 512 bytes/sector, 2096561 sectors

It worked so the function must be working.  I have attached my patch for 
review inline.  It goes against /sys/crypto/xform.c

-peter

--- xform.c.orig        Mon Jun  8 09:29:27 2015
+++ xform.c     Mon Jun  8 09:34:14 2015
@@ -106,6 +106,8 @@
 u_int32_t deflate_decompress(u_int8_t *, u_int32_t, u_int8_t **);
 u_int32_t lzs_dummy(u_int8_t *, u_int32_t, u_int8_t **);
 
+void xorfunc(u_int8_t *, u_int8_t *, int);
+
 #define AESCTR_NONCESIZE       4
 #define AESCTR_IVSIZE          8
 #define AESCTR_BLOCKSIZE       16
@@ -499,8 +501,11 @@
                if (++ctx->ac_block[i])   /* continue on overflow */
                        break;
        rijndaelEncrypt(ctx->ac_ek, ctx->ac_nr, ctx->ac_block, keystream);
+#if 0
        for (i = 0; i < AESCTR_BLOCKSIZE; i++)
                data[i] ^= keystream[i];
+#endif
+       xorfunc(data, keystream, AESCTR_BLOCKSIZE);
        explicit_bzero(keystream, sizeof(keystream));
 }
 
@@ -557,8 +562,11 @@
        else
                rijndael_decrypt(&ctx->key1, block, data);
 
+#if 0
        for (i = 0; i < AES_XTS_BLOCKSIZE; i++)
                data[i] ^= ctx->tweak[i];
+#endif
+       xorfunc(data, ctx->tweak, AES_XTS_BLOCKSIZE);
 
        /* Exponentiate tweak */
        carry_in = 0;
@@ -676,4 +684,27 @@
 {
        *out = NULL;
        return (0);
+}
+
+void
+xorfunc(u_int8_t *output, u_int8_t *input, int len)
+{
+        int i;
+#if __amd64__
+        u_int8_t *i0, *i1, *i2, *i3;
+        u_int8_t *o0, *o1, *o2, *o3;
+
+        for (i = 0; i < len; i += 4) {
+                i0 = (u_int8_t *)&input[0 + i]; i1=(u_int8_t *)&input[1 + i];
+                i2 = (u_int8_t *)&input[2 + i]; i3=(u_int8_t *)&input[3 + i];
+                o0 = (u_int8_t *)&output[0 + i]; o1=(u_int8_t *)&output[1 + i];
+                o2 = (u_int8_t *)&output[2 + i]; o3=(u_int8_t *)&output[3 + i];
+
+                *o0 ^= *i0; *o1 ^= *i1; *o2 ^= *i2; *o3 ^= *i3;
+        }
+#else
+        for (i = 0; i < len; i++) {
+                output[i] ^= input[i];
+        }
+#endif
 }

Reply via email to