Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-28 Thread Borislav Petkov
On Tue, Aug 28, 2012 at 12:17:43PM +0300, Jussi Kivilinna wrote:
> With this patch twofish-avx is faster than twofish-3way for 256, 1k
> and 8k tests.
> 
> size    old-vs-new      new-vs-3way     old-vs-3way
>         ecb-enc ecb-dec ecb-enc ecb-dec ecb-enc ecb-dec
> 256     1.10x   1.11x   1.01x   1.01x   0.92x   0.91x
> 1k      1.11x   1.12x   1.08x   1.07x   0.97x   0.96x
> 8k      1.11x   1.13x   1.10x   1.08x   0.99x   0.97x

Not bad, that's 10ish percent improvement, after all.

Thanks.

-- 
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-28 Thread Jussi Kivilinna

Quoting Borislav Petkov :


On Wed, Aug 22, 2012 at 10:20:03PM +0300, Jussi Kivilinna wrote:
Actually it does look better, at least for encryption. Decryption  
had different

ordering for test, which appears to be bad on bulldozer as it is on
sandy-bridge.

So, yet another patch then :)


Here you go:


Thanks!

With this patch twofish-avx is faster than twofish-3way for 256, 1k  
and 8k tests.


size    old-vs-new      new-vs-3way     old-vs-3way
        ecb-enc ecb-dec ecb-enc ecb-dec ecb-enc ecb-dec
256     1.10x   1.11x   1.01x   1.01x   0.92x   0.91x
1k      1.11x   1.12x   1.08x   1.07x   0.97x   0.96x
8k      1.11x   1.13x   1.10x   1.08x   0.99x   0.97x

-Jussi



[  153.736745]
[  153.736745] testing speed of async ecb(twofish) encryption
[  153.745806] test 0 (128 bit key, 16 byte blocks): 4832343  
operations in 1 seconds (77317488 bytes)
[  154.752525] test 1 (128 bit key, 64 byte blocks): 2049979  
operations in 1 seconds (131198656 bytes)
[  155.755195] test 2 (128 bit key, 256 byte blocks): 620439  
operations in 1 seconds (158832384 bytes)
[  156.761694] test 3 (128 bit key, 1024 byte blocks): 173900  
operations in 1 seconds (178073600 bytes)
[  157.768282] test 4 (128 bit key, 8192 byte blocks): 22366  
operations in 1 seconds (183222272 bytes)
[  158.774815] test 5 (192 bit key, 16 byte blocks): 4850741  
operations in 1 seconds (77611856 bytes)
[  159.781498] test 6 (192 bit key, 64 byte blocks): 2046772  
operations in 1 seconds (130993408 bytes)
[  160.788163] test 7 (192 bit key, 256 byte blocks): 619915  
operations in 1 seconds (158698240 bytes)
[  161.794636] test 8 (192 bit key, 1024 byte blocks): 173442  
operations in 1 seconds (177604608 bytes)
[  162.801242] test 9 (192 bit key, 8192 byte blocks): 22083  
operations in 1 seconds (180903936 bytes)
[  163.807793] test 10 (256 bit key, 16 byte blocks): 4862951  
operations in 1 seconds (77807216 bytes)
[  164.814449] test 11 (256 bit key, 64 byte blocks): 2050036  
operations in 1 seconds (131202304 bytes)
[  165.821121] test 12 (256 bit key, 256 byte blocks): 620349  
operations in 1 seconds (158809344 bytes)
[  166.827621] test 13 (256 bit key, 1024 byte blocks): 173917  
operations in 1 seconds (178091008 bytes)
[  167.834218] test 14 (256 bit key, 8192 byte blocks): 22362  
operations in 1 seconds (183189504 bytes)

[  168.840798]
[  168.840798] testing speed of async ecb(twofish) decryption
[  168.849968] test 0 (128 bit key, 16 byte blocks): 4889899  
operations in 1 seconds (78238384 bytes)
[  169.855439] test 1 (128 bit key, 64 byte blocks): 2052293  
operations in 1 seconds (131346752 bytes)
[  170.862113] test 2 (128 bit key, 256 byte blocks): 616979  
operations in 1 seconds (157946624 bytes)
[  171.868631] test 3 (128 bit key, 1024 byte blocks): 172773  
operations in 1 seconds (176919552 bytes)
[  172.875244] test 4 (128 bit key, 8192 byte blocks): 22224  
operations in 1 seconds (182059008 bytes)
[  173.881777] test 5 (192 bit key, 16 byte blocks): 4893653  
operations in 1 seconds (78298448 bytes)
[  174.888451] test 6 (192 bit key, 64 byte blocks): 2048078  
operations in 1 seconds (131076992 bytes)
[  175.895131] test 7 (192 bit key, 256 byte blocks): 619204  
operations in 1 seconds (158516224 bytes)
[  176.901651] test 8 (192 bit key, 1024 byte blocks): 172569  
operations in 1 seconds (176710656 bytes)
[  177.908253] test 9 (192 bit key, 8192 byte blocks): 21888  
operations in 1 seconds (179306496 bytes)
[  178.914781] test 10 (256 bit key, 16 byte blocks): 4921751  
operations in 1 seconds (78748016 bytes)
[  179.917481] test 11 (256 bit key, 64 byte blocks): 2051219  
operations in 1 seconds (131278016 bytes)
[  180.920147] test 12 (256 bit key, 256 byte blocks): 618536  
operations in 1 seconds (158345216 bytes)
[  181.926637] test 13 (256 bit key, 1024 byte blocks): 172886  
operations in 1 seconds (177035264 bytes)
[  182.933249] test 14 (256 bit key, 8192 byte blocks): 22222  
operations in 1 seconds (182042624 bytes)

[  183.939803]
[  183.939803] testing speed of async cbc(twofish) encryption
[  183.953902] test 0 (128 bit key, 16 byte blocks): 5195403  
operations in 1 seconds (83126448 bytes)
[  184.962487] test 1 (128 bit key, 64 byte blocks): 1912010  
operations in 1 seconds (122368640 bytes)
[  185.969150] test 2 (128 bit key, 256 byte blocks): 540125  
operations in 1 seconds (138272000 bytes)
[  186.975650] test 3 (128 bit key, 1024 byte blocks): 140631  
operations in 1 seconds (144006144 bytes)
[  187.982411] test 4 (128 bit key, 8192 byte blocks): 17737  
operations in 1 seconds (145301504 bytes)
[  188.988782] test 5 (192 bit key, 16 byte blocks): 5182287  
operations in 1 seconds (82916592 bytes)
[  189.995435] test 6 (192 bit key, 64 byte blocks): 1912356  
operations in 1 seconds (122390784 bytes)
[  191.002093] test 7 (192 bit key, 256 byte blocks): 540991  
operations in 1 seconds (138493696 bytes)
[  192.008600] test 8 (192 bit key, 1024 byte blocks): 140791  
operations in 1 seconds (144169984 bytes)

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-28 Thread Borislav Petkov
On Tue, Aug 28, 2012 at 12:17:43PM +0300, Jussi Kivilinna wrote:
> With this patch twofish-avx is faster than twofish-3way for 256, 1k
> and 8k tests.
> 
> size    old-vs-new      new-vs-3way     old-vs-3way
>         ecb-enc ecb-dec ecb-enc ecb-dec ecb-enc ecb-dec
> 256     1.10x   1.11x   1.01x   1.01x   0.92x   0.91x
> 1k      1.11x   1.12x   1.08x   1.07x   0.97x   0.96x
> 8k      1.11x   1.13x   1.10x   1.08x   0.99x   0.97x

Not bad, that's 10ish percent improvement, after all.

Thanks.

-- 
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-28 Thread Jussi Kivilinna

Quoting Borislav Petkov b...@alien8.de:


On Wed, Aug 22, 2012 at 10:20:03PM +0300, Jussi Kivilinna wrote:
Actually it does look better, at least for encryption. Decryption  
had different

ordering for test, which appears to be bad on bulldozer as it is on
sandy-bridge.

So, yet another patch then :)


Here you go:


Thanks!

With this patch twofish-avx is faster than twofish-3way for 256, 1k  
and 8k tests.


size    old-vs-new      new-vs-3way     old-vs-3way
        ecb-enc ecb-dec ecb-enc ecb-dec ecb-enc ecb-dec
256     1.10x   1.11x   1.01x   1.01x   0.92x   0.91x
1k      1.11x   1.12x   1.08x   1.07x   0.97x   0.96x
8k      1.11x   1.13x   1.10x   1.08x   0.99x   0.97x

-Jussi



[  153.736745]
[  153.736745] testing speed of async ecb(twofish) encryption
[  153.745806] test 0 (128 bit key, 16 byte blocks): 4832343  
operations in 1 seconds (77317488 bytes)
[  154.752525] test 1 (128 bit key, 64 byte blocks): 2049979  
operations in 1 seconds (131198656 bytes)
[  155.755195] test 2 (128 bit key, 256 byte blocks): 620439  
operations in 1 seconds (158832384 bytes)
[  156.761694] test 3 (128 bit key, 1024 byte blocks): 173900  
operations in 1 seconds (178073600 bytes)
[  157.768282] test 4 (128 bit key, 8192 byte blocks): 22366  
operations in 1 seconds (183222272 bytes)
[  158.774815] test 5 (192 bit key, 16 byte blocks): 4850741  
operations in 1 seconds (77611856 bytes)
[  159.781498] test 6 (192 bit key, 64 byte blocks): 2046772  
operations in 1 seconds (130993408 bytes)
[  160.788163] test 7 (192 bit key, 256 byte blocks): 619915  
operations in 1 seconds (158698240 bytes)
[  161.794636] test 8 (192 bit key, 1024 byte blocks): 173442  
operations in 1 seconds (177604608 bytes)
[  162.801242] test 9 (192 bit key, 8192 byte blocks): 22083  
operations in 1 seconds (180903936 bytes)
[  163.807793] test 10 (256 bit key, 16 byte blocks): 4862951  
operations in 1 seconds (77807216 bytes)
[  164.814449] test 11 (256 bit key, 64 byte blocks): 2050036  
operations in 1 seconds (131202304 bytes)
[  165.821121] test 12 (256 bit key, 256 byte blocks): 620349  
operations in 1 seconds (158809344 bytes)
[  166.827621] test 13 (256 bit key, 1024 byte blocks): 173917  
operations in 1 seconds (178091008 bytes)
[  167.834218] test 14 (256 bit key, 8192 byte blocks): 22362  
operations in 1 seconds (183189504 bytes)

[  168.840798]
[  168.840798] testing speed of async ecb(twofish) decryption
[  168.849968] test 0 (128 bit key, 16 byte blocks): 4889899  
operations in 1 seconds (78238384 bytes)
[  169.855439] test 1 (128 bit key, 64 byte blocks): 2052293  
operations in 1 seconds (131346752 bytes)
[  170.862113] test 2 (128 bit key, 256 byte blocks): 616979  
operations in 1 seconds (157946624 bytes)
[  171.868631] test 3 (128 bit key, 1024 byte blocks): 172773  
operations in 1 seconds (176919552 bytes)
[  172.875244] test 4 (128 bit key, 8192 byte blocks): 22224  
operations in 1 seconds (182059008 bytes)
[  173.881777] test 5 (192 bit key, 16 byte blocks): 4893653  
operations in 1 seconds (78298448 bytes)
[  174.888451] test 6 (192 bit key, 64 byte blocks): 2048078  
operations in 1 seconds (131076992 bytes)
[  175.895131] test 7 (192 bit key, 256 byte blocks): 619204  
operations in 1 seconds (158516224 bytes)
[  176.901651] test 8 (192 bit key, 1024 byte blocks): 172569  
operations in 1 seconds (176710656 bytes)
[  177.908253] test 9 (192 bit key, 8192 byte blocks): 21888  
operations in 1 seconds (179306496 bytes)
[  178.914781] test 10 (256 bit key, 16 byte blocks): 4921751  
operations in 1 seconds (78748016 bytes)
[  179.917481] test 11 (256 bit key, 64 byte blocks): 2051219  
operations in 1 seconds (131278016 bytes)
[  180.920147] test 12 (256 bit key, 256 byte blocks): 618536  
operations in 1 seconds (158345216 bytes)
[  181.926637] test 13 (256 bit key, 1024 byte blocks): 172886  
operations in 1 seconds (177035264 bytes)
[  182.933249] test 14 (256 bit key, 8192 byte blocks): 22222  
operations in 1 seconds (182042624 bytes)

[  183.939803]
[  183.939803] testing speed of async cbc(twofish) encryption
[  183.953902] test 0 (128 bit key, 16 byte blocks): 5195403  
operations in 1 seconds (83126448 bytes)
[  184.962487] test 1 (128 bit key, 64 byte blocks): 1912010  
operations in 1 seconds (122368640 bytes)
[  185.969150] test 2 (128 bit key, 256 byte blocks): 540125  
operations in 1 seconds (138272000 bytes)
[  186.975650] test 3 (128 bit key, 1024 byte blocks): 140631  
operations in 1 seconds (144006144 bytes)
[  187.982411] test 4 (128 bit key, 8192 byte blocks): 17737  
operations in 1 seconds (145301504 bytes)
[  188.988782] test 5 (192 bit key, 16 byte blocks): 5182287  
operations in 1 seconds (82916592 bytes)
[  189.995435] test 6 (192 bit key, 64 byte blocks): 1912356  
operations in 1 seconds (122390784 bytes)
[  191.002093] test 7 (192 bit key, 256 byte blocks): 540991  
operations in 1 seconds (138493696 bytes)
[  192.008600] test 8 (192 bit key, 1024 byte blocks): 140791  
operations in 1 seconds (144169984 bytes)

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-23 Thread Borislav Petkov
On Wed, Aug 22, 2012 at 10:20:03PM +0300, Jussi Kivilinna wrote:
> Actually it does look better, at least for encryption. Decryption had 
> different
> ordering for test, which appears to be bad on bulldozer as it is on
> sandy-bridge.
> 
> So, yet another patch then :)

Here you go:

[  153.736745] 
[  153.736745] testing speed of async ecb(twofish) encryption
[  153.745806] test 0 (128 bit key, 16 byte blocks): 4832343 operations in 1 
seconds (77317488 bytes)
[  154.752525] test 1 (128 bit key, 64 byte blocks): 2049979 operations in 1 
seconds (131198656 bytes)
[  155.755195] test 2 (128 bit key, 256 byte blocks): 620439 operations in 1 
seconds (158832384 bytes)
[  156.761694] test 3 (128 bit key, 1024 byte blocks): 173900 operations in 1 
seconds (178073600 bytes)
[  157.768282] test 4 (128 bit key, 8192 byte blocks): 22366 operations in 1 
seconds (183222272 bytes)
[  158.774815] test 5 (192 bit key, 16 byte blocks): 4850741 operations in 1 
seconds (77611856 bytes)
[  159.781498] test 6 (192 bit key, 64 byte blocks): 2046772 operations in 1 
seconds (130993408 bytes)
[  160.788163] test 7 (192 bit key, 256 byte blocks): 619915 operations in 1 
seconds (158698240 bytes)
[  161.794636] test 8 (192 bit key, 1024 byte blocks): 173442 operations in 1 
seconds (177604608 bytes)
[  162.801242] test 9 (192 bit key, 8192 byte blocks): 22083 operations in 1 
seconds (180903936 bytes)
[  163.807793] test 10 (256 bit key, 16 byte blocks): 4862951 operations in 1 
seconds (77807216 bytes)
[  164.814449] test 11 (256 bit key, 64 byte blocks): 2050036 operations in 1 
seconds (131202304 bytes)
[  165.821121] test 12 (256 bit key, 256 byte blocks): 620349 operations in 1 
seconds (158809344 bytes)
[  166.827621] test 13 (256 bit key, 1024 byte blocks): 173917 operations in 1 
seconds (178091008 bytes)
[  167.834218] test 14 (256 bit key, 8192 byte blocks): 22362 operations in 1 
seconds (183189504 bytes)
[  168.840798] 
[  168.840798] testing speed of async ecb(twofish) decryption
[  168.849968] test 0 (128 bit key, 16 byte blocks): 4889899 operations in 1 
seconds (78238384 bytes)
[  169.855439] test 1 (128 bit key, 64 byte blocks): 2052293 operations in 1 
seconds (131346752 bytes)
[  170.862113] test 2 (128 bit key, 256 byte blocks): 616979 operations in 1 
seconds (157946624 bytes)
[  171.868631] test 3 (128 bit key, 1024 byte blocks): 172773 operations in 1 
seconds (176919552 bytes)
[  172.875244] test 4 (128 bit key, 8192 byte blocks): 22224 operations in 1 
seconds (182059008 bytes)
[  173.881777] test 5 (192 bit key, 16 byte blocks): 4893653 operations in 1 
seconds (78298448 bytes)
[  174.888451] test 6 (192 bit key, 64 byte blocks): 2048078 operations in 1 
seconds (131076992 bytes)
[  175.895131] test 7 (192 bit key, 256 byte blocks): 619204 operations in 1 
seconds (158516224 bytes)
[  176.901651] test 8 (192 bit key, 1024 byte blocks): 172569 operations in 1 
seconds (176710656 bytes)
[  177.908253] test 9 (192 bit key, 8192 byte blocks): 21888 operations in 1 
seconds (179306496 bytes)
[  178.914781] test 10 (256 bit key, 16 byte blocks): 4921751 operations in 1 
seconds (78748016 bytes)
[  179.917481] test 11 (256 bit key, 64 byte blocks): 2051219 operations in 1 
seconds (131278016 bytes)
[  180.920147] test 12 (256 bit key, 256 byte blocks): 618536 operations in 1 
seconds (158345216 bytes)
[  181.926637] test 13 (256 bit key, 1024 byte blocks): 172886 operations in 1 
seconds (177035264 bytes)
[  182.933249] test 14 (256 bit key, 8192 byte blocks): 22222 operations in 1 
seconds (182042624 bytes)
[  183.939803] 
[  183.939803] testing speed of async cbc(twofish) encryption
[  183.953902] test 0 (128 bit key, 16 byte blocks): 5195403 operations in 1 
seconds (83126448 bytes)
[  184.962487] test 1 (128 bit key, 64 byte blocks): 1912010 operations in 1 
seconds (122368640 bytes)
[  185.969150] test 2 (128 bit key, 256 byte blocks): 540125 operations in 1 
seconds (138272000 bytes)
[  186.975650] test 3 (128 bit key, 1024 byte blocks): 140631 operations in 1 
seconds (144006144 bytes)
[  187.982411] test 4 (128 bit key, 8192 byte blocks): 17737 operations in 1 
seconds (145301504 bytes)
[  188.988782] test 5 (192 bit key, 16 byte blocks): 5182287 operations in 1 
seconds (82916592 bytes)
[  189.995435] test 6 (192 bit key, 64 byte blocks): 1912356 operations in 1 
seconds (122390784 bytes)
[  191.002093] test 7 (192 bit key, 256 byte blocks): 540991 operations in 1 
seconds (138493696 bytes)
[  192.008600] test 8 (192 bit key, 1024 byte blocks): 140791 operations in 1 
seconds (144169984 bytes)
[  193.015197] test 9 (192 bit key, 8192 byte blocks): 17609 operations in 1 
seconds (144252928 bytes)
[  194.021740] test 10 (256 bit key, 16 byte blocks): 5191521 operations in 1 
seconds (83064336 bytes)
[  195.028534] test 11 (256 bit key, 64 byte blocks): 1906226 operations in 1 
seconds (121998464 bytes)
[  196.035069] test 12 (256 bit key, 256 byte blocks): 540479 operations in 1 
seconds (138362624 bytes)

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-23 Thread Jussi Kivilinna

Quoting Jason Garrett-Glaser :


On Wed, Aug 22, 2012 at 12:20 PM, Jussi Kivilinna
 wrote:

Quoting Borislav Petkov :


On Wed, Aug 22, 2012 at 07:35:12AM +0300, Jussi Kivilinna wrote:

Looks that encryption lost ~0.4% while decryption gained ~1.8%.

For 256 byte test, it's still slightly slower than twofish-3way
(~3%). For 1k
and 8k tests, it's ~5% faster.

Here's very last test-patch, testing different ordering of fpu<->cpu reg
instructions at few places.


Hehe,.

I don't mind testing patches, no worries there. Here are the results
this time, doesn't look better than the last run, AFAICT.



Actually it does look better, at least for encryption. Decryption  
had different

ordering for test, which appears to be bad on bulldozer as it is on
sandy-bridge.

So, yet another patch then :)

Interleaving at some new places (reordered lookup_32bit()s in G-macro) and
doing one of the round rotations one round ahead. Also introduces some
more parallelism inside lookup_32bit.


Outsider looking in here, but avoiding the 256-way lookup tables
entirely might be faster.  Looking at the twofish code, one byte-wise
calculation looks like this:

a0 = x >> 4; b0 = x & 15;
a1 = a0 ^ b0; b1 = ror4[b0] ^ ashx[a0];
a2 = qt0[n][a1]; b2 = qt1[n][b1];
a3 = a2 ^ b2; b3 = ror4[b2] ^ ashx[a2];
a4 = qt2[n][a3]; b4 = qt3[n][b3];
return (b4 << 4) | a4;

This means that you can do something like this pseudocode (Intel
syntax).  pshufb on ymm registers is AVX2, but splitting it into xmm
operations would probably be fine (as would using this for just a pure
SSE implementation!).  On AVX2 you'd have to double the tables for both
ways, naturally.

constants:
pb_0x0f = {0x0f,0x0f,0x0f ... }
ashx: lookup table
ror4: lookup table
qt0[n]: lookup table
qt1[n]: lookup table
qt2[n]: lookup table
qt3[n]: lookup table

vpand    b0, in, pb_0x0f
vpsrlw   a0, in, 4
vpand    a0, a0, pb_0x0f    ; effectively vpsrlb, but that doesn't exist

vpxor    a1, a0, b0
vpshufb  a0,   ashx, a0
vpshufb  b0,   ror4, b0
vpxor    b1, a0, b0

vpshufb  a2, qt0[n], a1
vpshufb  b2, qt1[n], b1

vpxor    a3, a2, b2
vpshufb  a3,   ashx, a2
vpshufb  b3,   ror4, b2
vpxor    b3, a2, b2

vpshufb  a4, qt2[n], a3
vpshufb  b4, qt3[n], b3

vpsllw   b4, b4, 4          ; effectively vpsrlb, but that doesn't exist
vpor     out, a4, b4

That's 15 instructions (plus maybe a move or two) to do 16 lookups for
SSE (~9 cycles by my guessing on a Nehalem).  AVX would run into the
problem of lots of extra vinsert/vextract (just going 16-byte might be
better, might be not, depending on execution units).  AVX2 would be
super fast (15 for 32).

If this works, this could be quite a bit faster with the table-based  
approach.


The above would implement twofish permutations q0 and q1? For  
byte-sliced implementation you would need 8 parallel blocks (16b  
registers, two parallel h-functions for round, 16/2).


In this setup, for double h-function, you need 12 q0/1 operations (for  
128bit key, for 192bit: 16, for 256bit: 20), plus 8 key material xors  
(for 192bit 12, 256bit 16) and MDS matrix multiplication (a lot more  
than 15 instructions, I'd think). We do 16-rounds so that gives us,  
((12*15+8+15)*16)/(8*16) > 25.3 cycles/byte. Usually I get ~2.5  
instructions/cycle for pure SSE2, so that's 10 cycles/byte.


After that we have PHT phase. But now problem is that PHT base uses  
32-bit additions, so either we move between byte-sliced and  
dword-sliced modes here or move addition carry over bytes. After PHT  
there is 32-bit addition with key material and 32-bit rotations.


I don't think this is going to work. For AVX2, vpgatherdd is going to  
speed up 32-bit lookups anyway.


-Jussi



Jason






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-23 Thread Jussi Kivilinna

Quoting Jason Garrett-Glaser ja...@x264.com:


On Wed, Aug 22, 2012 at 12:20 PM, Jussi Kivilinna
jussi.kivili...@mbnet.fi wrote:

Quoting Borislav Petkov b...@alien8.de:


On Wed, Aug 22, 2012 at 07:35:12AM +0300, Jussi Kivilinna wrote:

Looks that encryption lost ~0.4% while decryption gained ~1.8%.

For 256 byte test, it's still slightly slower than twofish-3way
(~3%). For 1k
and 8k tests, it's ~5% faster.

Here's very last test-patch, testing different ordering of fpu<->cpu reg
instructions at few places.


Hehe,.

I don't mind testing patches, no worries there. Here are the results
this time, doesn't look better than the last run, AFAICT.



Actually it does look better, at least for encryption. Decryption  
had different

ordering for test, which appears to be bad on bulldozer as it is on
sandy-bridge.

So, yet another patch then :)

Interleaving at some new places (reordered lookup_32bit()s in G-macro) and
doing one of the round rotations one round ahead. Also introduces some
more parallelism inside lookup_32bit.


Outsider looking in here, but avoiding the 256-way lookup tables
entirely might be faster.  Looking at the twofish code, one byte-wise
calculation looks like this:

a0 = x >> 4; b0 = x & 15;
a1 = a0 ^ b0; b1 = ror4[b0] ^ ashx[a0];
a2 = qt0[n][a1]; b2 = qt1[n][b1];
a3 = a2 ^ b2; b3 = ror4[b2] ^ ashx[a2];
a4 = qt2[n][a3]; b4 = qt3[n][b3];
return (b4 << 4) | a4;

This means that you can do something like this pseudocode (Intel
syntax).  pshufb on ymm registers is AVX2, but splitting it into xmm
operations would probably be fine (as would using this for just a pure
SSE implementation!).  On AVX2 you'd have to double the tables for both
ways, naturally.

constants:
pb_0x0f = {0x0f,0x0f,0x0f ... }
ashx: lookup table
ror4: lookup table
qt0[n]: lookup table
qt1[n]: lookup table
qt2[n]: lookup table
qt3[n]: lookup table

vpand    b0, in, pb_0x0f
vpsrlw   a0, in, 4
vpand    a0, a0, pb_0x0f    ; effectively vpsrlb, but that doesn't exist

vpxor    a1, a0, b0
vpshufb  a0,   ashx, a0
vpshufb  b0,   ror4, b0
vpxor    b1, a0, b0

vpshufb  a2, qt0[n], a1
vpshufb  b2, qt1[n], b1

vpxor    a3, a2, b2
vpshufb  a3,   ashx, a2
vpshufb  b3,   ror4, b2
vpxor    b3, a2, b2

vpshufb  a4, qt2[n], a3
vpshufb  b4, qt3[n], b3

vpsllw   b4, b4, 4          ; effectively vpsrlb, but that doesn't exist
vpor     out, a4, b4

That's 15 instructions (plus maybe a move or two) to do 16 lookups for
SSE (~9 cycles by my guessing on a Nehalem).  AVX would run into the
problem of lots of extra vinsert/vextract (just going 16-byte might be
better, might be not, depending on execution units).  AVX2 would be
super fast (15 for 32).

If this works, this could be quite a bit faster with the table-based  
approach.


The above would implement twofish permutations q0 and q1? For  
byte-sliced implementation you would need 8 parallel blocks (16b  
registers, two parallel h-functions for round, 16/2).


In this setup, for double h-function, you need 12 q0/1 operations (for  
128bit key, for 192bit: 16, for 256bit: 20), plus 8 key material xors  
(for 192bit 12, 256bit 16) and MDS matrix multiplication (a lot more  
than 15 instructions, I'd think). We do 16-rounds so that gives us,  
((12*15+8+15)*16)/(8*16) > 25.3 cycles/byte. Usually I get ~2.5  
instructions/cycle for pure SSE2, so that's 10 cycles/byte.


After that we have PHT phase. But now problem is that PHT base uses  
32-bit additions, so either we move between byte-sliced and  
dword-sliced modes here or move addition carry over bytes. After PHT  
there is 32-bit addition with key material and 32-bit rotations.


I don't think this is going to work. For AVX2, vpgatherdd is going to  
speed up 32-bit lookups anyway.


-Jussi



Jason






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-23 Thread Borislav Petkov
On Wed, Aug 22, 2012 at 10:20:03PM +0300, Jussi Kivilinna wrote:
> Actually it does look better, at least for encryption. Decryption had 
> different
> ordering for test, which appears to be bad on bulldozer as it is on
> sandy-bridge.
> 
> So, yet another patch then :)

Here you go:

[  153.736745] 
[  153.736745] testing speed of async ecb(twofish) encryption
[  153.745806] test 0 (128 bit key, 16 byte blocks): 4832343 operations in 1 
seconds (77317488 bytes)
[  154.752525] test 1 (128 bit key, 64 byte blocks): 2049979 operations in 1 
seconds (131198656 bytes)
[  155.755195] test 2 (128 bit key, 256 byte blocks): 620439 operations in 1 
seconds (158832384 bytes)
[  156.761694] test 3 (128 bit key, 1024 byte blocks): 173900 operations in 1 
seconds (178073600 bytes)
[  157.768282] test 4 (128 bit key, 8192 byte blocks): 22366 operations in 1 
seconds (183222272 bytes)
[  158.774815] test 5 (192 bit key, 16 byte blocks): 4850741 operations in 1 
seconds (77611856 bytes)
[  159.781498] test 6 (192 bit key, 64 byte blocks): 2046772 operations in 1 
seconds (130993408 bytes)
[  160.788163] test 7 (192 bit key, 256 byte blocks): 619915 operations in 1 
seconds (158698240 bytes)
[  161.794636] test 8 (192 bit key, 1024 byte blocks): 173442 operations in 1 
seconds (177604608 bytes)
[  162.801242] test 9 (192 bit key, 8192 byte blocks): 22083 operations in 1 
seconds (180903936 bytes)
[  163.807793] test 10 (256 bit key, 16 byte blocks): 4862951 operations in 1 
seconds (77807216 bytes)
[  164.814449] test 11 (256 bit key, 64 byte blocks): 2050036 operations in 1 
seconds (131202304 bytes)
[  165.821121] test 12 (256 bit key, 256 byte blocks): 620349 operations in 1 
seconds (158809344 bytes)
[  166.827621] test 13 (256 bit key, 1024 byte blocks): 173917 operations in 1 
seconds (178091008 bytes)
[  167.834218] test 14 (256 bit key, 8192 byte blocks): 22362 operations in 1 
seconds (183189504 bytes)
[  168.840798] 
[  168.840798] testing speed of async ecb(twofish) decryption
[  168.849968] test 0 (128 bit key, 16 byte blocks): 4889899 operations in 1 
seconds (78238384 bytes)
[  169.855439] test 1 (128 bit key, 64 byte blocks): 2052293 operations in 1 
seconds (131346752 bytes)
[  170.862113] test 2 (128 bit key, 256 byte blocks): 616979 operations in 1 
seconds (157946624 bytes)
[  171.868631] test 3 (128 bit key, 1024 byte blocks): 172773 operations in 1 
seconds (176919552 bytes)
[  172.875244] test 4 (128 bit key, 8192 byte blocks): 22224 operations in 1 
seconds (182059008 bytes)
[  173.881777] test 5 (192 bit key, 16 byte blocks): 4893653 operations in 1 
seconds (78298448 bytes)
[  174.888451] test 6 (192 bit key, 64 byte blocks): 2048078 operations in 1 
seconds (131076992 bytes)
[  175.895131] test 7 (192 bit key, 256 byte blocks): 619204 operations in 1 
seconds (158516224 bytes)
[  176.901651] test 8 (192 bit key, 1024 byte blocks): 172569 operations in 1 
seconds (176710656 bytes)
[  177.908253] test 9 (192 bit key, 8192 byte blocks): 21888 operations in 1 
seconds (179306496 bytes)
[  178.914781] test 10 (256 bit key, 16 byte blocks): 4921751 operations in 1 
seconds (78748016 bytes)
[  179.917481] test 11 (256 bit key, 64 byte blocks): 2051219 operations in 1 
seconds (131278016 bytes)
[  180.920147] test 12 (256 bit key, 256 byte blocks): 618536 operations in 1 
seconds (158345216 bytes)
[  181.926637] test 13 (256 bit key, 1024 byte blocks): 172886 operations in 1 
seconds (177035264 bytes)
[  182.933249] test 14 (256 bit key, 8192 byte blocks): 22222 operations in 1 
seconds (182042624 bytes)
[  183.939803] 
[  183.939803] testing speed of async cbc(twofish) encryption
[  183.953902] test 0 (128 bit key, 16 byte blocks): 5195403 operations in 1 
seconds (83126448 bytes)
[  184.962487] test 1 (128 bit key, 64 byte blocks): 1912010 operations in 1 
seconds (122368640 bytes)
[  185.969150] test 2 (128 bit key, 256 byte blocks): 540125 operations in 1 
seconds (138272000 bytes)
[  186.975650] test 3 (128 bit key, 1024 byte blocks): 140631 operations in 1 
seconds (144006144 bytes)
[  187.982411] test 4 (128 bit key, 8192 byte blocks): 17737 operations in 1 
seconds (145301504 bytes)
[  188.988782] test 5 (192 bit key, 16 byte blocks): 5182287 operations in 1 
seconds (82916592 bytes)
[  189.995435] test 6 (192 bit key, 64 byte blocks): 1912356 operations in 1 
seconds (122390784 bytes)
[  191.002093] test 7 (192 bit key, 256 byte blocks): 540991 operations in 1 
seconds (138493696 bytes)
[  192.008600] test 8 (192 bit key, 1024 byte blocks): 140791 operations in 1 
seconds (144169984 bytes)
[  193.015197] test 9 (192 bit key, 8192 byte blocks): 17609 operations in 1 
seconds (144252928 bytes)
[  194.021740] test 10 (256 bit key, 16 byte blocks): 5191521 operations in 1 
seconds (83064336 bytes)
[  195.028534] test 11 (256 bit key, 64 byte blocks): 1906226 operations in 1 
seconds (121998464 bytes)
[  196.035069] test 12 (256 bit key, 256 byte blocks): 540479 operations in 1 
seconds (138362624 bytes)

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-22 Thread Jason Garrett-Glaser
On Wed, Aug 22, 2012 at 12:20 PM, Jussi Kivilinna
 wrote:
> Quoting Borislav Petkov :
>
>> On Wed, Aug 22, 2012 at 07:35:12AM +0300, Jussi Kivilinna wrote:
>>> Looks that encryption lost ~0.4% while decryption gained ~1.8%.
>>>
>>> For 256 byte test, it's still slightly slower than twofish-3way
>>> (~3%). For 1k
>>> and 8k tests, it's ~5% faster.
>>>
>>> Here's very last test-patch, testing different ordering of fpu<->cpu reg
>>> instructions at few places.
>>
>> Hehe,.
>>
>> I don't mind testing patches, no worries there. Here are the results
>> this time, doesn't look better than the last run, AFAICT.
>>
>
> Actually it does look better, at least for encryption. Decryption had 
> different
> ordering for test, which appears to be bad on bulldozer as it is on
> sandy-bridge.
>
> So, yet another patch then :)
>
> Interleaving at some new places (reordered lookup_32bit()s in G-macro) and
> doing one of the round rotations one round ahead. Also introduces some
> more parallelism inside lookup_32bit.

Outsider looking in here, but avoiding the 256-way lookup tables
entirely might be faster.  Looking at the twofish code, one byte-wise
calculation looks like this:

a0 = x >> 4; b0 = x & 15;
a1 = a0 ^ b0; b1 = ror4[b0] ^ ashx[a0];
a2 = qt0[n][a1]; b2 = qt1[n][b1];
a3 = a2 ^ b2; b3 = ror4[b2] ^ ashx[a2];
a4 = qt2[n][a3]; b4 = qt3[n][b3];
return (b4 << 4) | a4;

This means that you can do something like this pseudocode (Intel
syntax).  pshufb on ymm registers is AVX2, but splitting it into xmm
operations would probably be fine (as would using this for just a pure
SSE implementation!).  On AVX2 you'd have to double the tables for both
ways, naturally.

constants:
pb_0x0f = {0x0f,0x0f,0x0f ... }
ashx: lookup table
ror4: lookup table
qt0[n]: lookup table
qt1[n]: lookup table
qt2[n]: lookup table
qt3[n]: lookup table

vpand    b0, in, pb_0x0f
vpsrlw   a0, in, 4
vpand    a0, a0, pb_0x0f    ; effectively vpsrlb, but that doesn't exist

vpxor    a1, a0, b0
vpshufb  a0,   ashx, a0
vpshufb  b0,   ror4, b0
vpxor    b1, a0, b0

vpshufb  a2, qt0[n], a1
vpshufb  b2, qt1[n], b1

vpxor    a3, a2, b2
vpshufb  a3,   ashx, a2
vpshufb  b3,   ror4, b2
vpxor    b3, a2, b2

vpshufb  a4, qt2[n], a3
vpshufb  b4, qt3[n], b3

vpsllw   b4, b4, 4          ; effectively vpsrlb, but that doesn't exist
vpor     out, a4, b4

That's 15 instructions (plus maybe a move or two) to do 16 lookups for
SSE (~9 cycles by my guessing on a Nehalem).  AVX would run into the
problem of lots of extra vinsert/vextract (just going 16-byte might be
better, might be not, depending on execution units).  AVX2 would be
super fast (15 for 32).

If this works, this could be quite a bit faster with the table-based approach.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-22 Thread Jussi Kivilinna
Quoting Borislav Petkov :

> On Wed, Aug 22, 2012 at 07:35:12AM +0300, Jussi Kivilinna wrote:
>> Looks that encryption lost ~0.4% while decryption gained ~1.8%.
>>
>> For 256 byte test, it's still slightly slower than twofish-3way
>> (~3%). For 1k
>> and 8k tests, it's ~5% faster.
>>
>> Here's very last test-patch, testing different ordering of fpu<->cpu reg
>> instructions at few places.
>
> Hehe,
>
> I don't mind testing patches, no worries there. Here are the results
> this time, doesn't look better than the last run, AFAICT.
>

Actually it does look better, at least for encryption. Decryption had different
ordering for test, which appears to be bad on bulldozer as it is on
sandy-bridge.

So, yet another patch then :)

Interleaving at some new places (reordered lookup_32bit()s in G-macro) and
doing one of the round rotations one round ahead. Also introduces some
more parallelism inside lookup_32bit.

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  227 +--
 1 file changed, 142 insertions(+), 85 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..1585abb 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  * 
  *
+ * Copyright © 2012 Jussi Kivilinna 
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -47,16 +49,22 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
 
-#define RID1  %rax
-#define RID1b %al
-#define RID2  %rbx
-#define RID2b %bl
+#define RT %xmm14
+#define RR %xmm15
+
+#define RID1  %rbp
+#define RID1d %ebp
+#define RID2  %rsi
+#define RID2d %esi
 
 #define RGI1   %rdx
 #define RGI1bl %dl
@@ -65,6 +73,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch
 
+#define RGI3   %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4   %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RGS1  %r8
 #define RGS1d %r8d
 #define RGS2  %r9
@@ -73,89 +88,123 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
-   movlt0(CTX, RID1, 4), dst ## d;  \
-   xorlt1(CTX, RID2, 4), dst ## d;  \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
shrq $16,   src; \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+   movlt0(CTX, RID1, 4), dst ## d;  \
+   movlt1(CTX, RID2, 4), RID2d; \
+   movzbl  src ## bl,RID1d; \
+   xorlRID2d,dst ## d;  \
+   movzbl  src ## bh,RID2d; \
+   interleave_op(il_reg);   \
xorlt2(CTX, RID1, 4), dst ## d;  \
xorlt3(CTX, RID2, 4), dst ## d;
 
-#define G(a, x, t0, t1, t2, t3) \
-   vmovq   a,RGI1;   \
-   vpsrldq $8, a,x;  \
-   vmovq   x,RGI2;   \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+   shrq $16,   reg;
+
+#define G(gi1, gi2, x, t0, t1, t2, t3) \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
+   \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);  \
+   shlq $32,   RGS2;\
+   orq RGS1, RGS2;  \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);  \
+   shlq $32,   RGS1;\
+   orq RGS1, RGS3;
+
+#define round_head_2(a, b, x1, y1, x2, y2) \
+   vmovq   b ## 1, RGI3;   \
+   vpextrq $1, b ## 1, RGI4;   \
\
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-   shrq $16,   RGI1; \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
-   shlq $32,   RGS2; \
-   orq RGS1, RGS2;   \
+   G(RGI1, RGI2, x1, s0, s1, s2, s3);  \
+   vmovq   a ## 2, RGI1;   \
+   vpextrq $1, a ## 2, RGI2;   \
+   vmovq   RGS2, x1;   \
+   vpinsrq $1, RGS3, x1, x1;  

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-22 Thread Borislav Petkov
On Wed, Aug 22, 2012 at 07:35:12AM +0300, Jussi Kivilinna wrote:
> Looks that encryption lost ~0.4% while decryption gained ~1.8%.
> 
> For 256 byte test, it's still slightly slower than twofish-3way (~3%). For 1k
> and 8k tests, it's ~5% faster.
> 
> Here's very last test-patch, testing different ordering of fpu<->cpu reg
> instructions at few places.

Hehe,

I don't mind testing patches, no worries there. Here are the results
this time, doesn't look better than the last run, AFAICT.

[  133.952723] 
[  133.952723] testing speed of async ecb(twofish) encryption
[  133.961946] test 0 (128 bit key, 16 byte blocks): 4768513 operations in 1 
seconds (76296208 bytes)
[  134.968388] test 1 (128 bit key, 64 byte blocks): 2033479 operations in 1 
seconds (130142656 bytes)
[  135.975070] test 2 (128 bit key, 256 byte blocks): 604754 operations in 1 
seconds (154817024 bytes)
[  136.981570] test 3 (128 bit key, 1024 byte blocks): 169578 operations in 1 
seconds (173647872 bytes)
[  137.988191] test 4 (128 bit key, 8192 byte blocks): 21847 operations in 1 
seconds (178970624 bytes)
[  138.994735] test 5 (192 bit key, 16 byte blocks): 4777481 operations in 1 
seconds (76439696 bytes)
[  140.001382] test 6 (192 bit key, 64 byte blocks): 2035352 operations in 1 
seconds (130262528 bytes)
[  141.008038] test 7 (192 bit key, 256 byte blocks): 603240 operations in 1 
seconds (154429440 bytes)
[  142.014591] test 8 (192 bit key, 1024 byte blocks): 169266 operations in 1 
seconds (173328384 bytes)
[  143.021169] test 9 (192 bit key, 8192 byte blocks): 21610 operations in 1 
seconds (177029120 bytes)
[  144.027703] test 10 (256 bit key, 16 byte blocks): 4798051 operations in 1 
seconds (76768816 bytes)
[  145.034341] test 11 (256 bit key, 64 byte blocks): 2036766 operations in 1 
seconds (130353024 bytes)
[  146.041015] test 12 (256 bit key, 256 byte blocks): 604216 operations in 1 
seconds (154679296 bytes)
[  147.047523] test 13 (256 bit key, 1024 byte blocks): 169594 operations in 1 
seconds (173664256 bytes)
[  148.054120] test 14 (256 bit key, 8192 byte blocks): 21889 operations in 1 
seconds (179314688 bytes)
[  149.060657] 
[  149.060657] testing speed of async ecb(twofish) decryption
[  149.069830] test 0 (128 bit key, 16 byte blocks): 4890581 operations in 1 
seconds (78249296 bytes)
[  150.075322] test 1 (128 bit key, 64 byte blocks): 2006891 operations in 1 
seconds (128441024 bytes)
[  151.081994] test 2 (128 bit key, 256 byte blocks): 586650 operations in 1 
seconds (150182400 bytes)
[  152.088522] test 3 (128 bit key, 1024 byte blocks): 164734 operations in 1 
seconds (168687616 bytes)
[  153.091153] test 4 (128 bit key, 8192 byte blocks): 21111 operations in 1 
seconds (172941312 bytes)
[  154.097687] test 5 (192 bit key, 16 byte blocks): 4911365 operations in 1 
seconds (78581840 bytes)
[  155.104371] test 6 (192 bit key, 64 byte blocks): 2025363 operations in 1 
seconds (129623232 bytes)
[  156.54] test 7 (192 bit key, 256 byte blocks): 591229 operations in 1 
seconds (151354624 bytes)
[  157.117723] test 8 (192 bit key, 1024 byte blocks): 164381 operations in 1 
seconds (168326144 bytes)
[  158.124336] test 9 (192 bit key, 8192 byte blocks): 20714 operations in 1 
seconds (169689088 bytes)
[  159.130724] test 10 (256 bit key, 16 byte blocks): 4931938 operations in 1 
seconds (78911008 bytes)
[  160.137379] test 11 (256 bit key, 64 byte blocks): 2029741 operations in 1 
seconds (129903424 bytes)
[  161.144078] test 12 (256 bit key, 256 byte blocks): 589340 operations in 1 
seconds (150871040 bytes)
[  162.150580] test 13 (256 bit key, 1024 byte blocks): 164484 operations in 1 
seconds (168431616 bytes)
[  163.157174] test 14 (256 bit key, 8192 byte blocks): 21116 operations in 1 
seconds (172982272 bytes)
[  164.163694] 
[  164.163694] testing speed of async cbc(twofish) encryption
[  164.12] test 0 (128 bit key, 16 byte blocks): 5197069 operations in 1 
seconds (83153104 bytes)
[  165.186414] test 1 (128 bit key, 64 byte blocks): 1912975 operations in 1 
seconds (122430400 bytes)
[  166.193078] test 2 (128 bit key, 256 byte blocks): 540464 operations in 1 
seconds (138358784 bytes)
[  167.199587] test 3 (128 bit key, 1024 byte blocks): 140709 operations in 1 
seconds (144086016 bytes)
[  168.206209] test 4 (128 bit key, 8192 byte blocks): 17747 operations in 1 
seconds (145383424 bytes)
[  169.212768] test 5 (192 bit key, 16 byte blocks): 5184004 operations in 1 
seconds (82944064 bytes)
[  170.219372] test 6 (192 bit key, 64 byte blocks): 1913377 operations in 1 
seconds (122456128 bytes)
[  171.226028] test 7 (192 bit key, 256 byte blocks): 541385 operations in 1 
seconds (138594560 bytes)
[  172.232538] test 8 (192 bit key, 1024 byte blocks): 140867 operations in 1 
seconds (144247808 bytes)
[  173.239280] test 9 (192 bit key, 8192 byte blocks): 17642 operations in 1 
seconds (144523264 bytes)
[  174.245667] test 10 (256 bit key, 16 byte blocks): 5193804 operations in 1 
seconds (83100864 bytes)

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-22 Thread Borislav Petkov
On Wed, Aug 22, 2012 at 07:35:12AM +0300, Jussi Kivilinna wrote:
 Looks that encryption lost ~0.4% while decryption gained ~1.8%.
 
 For 256 byte test, it's still slightly slower than twofish-3way (~3%). For 1k
 and 8k tests, it's ~5% faster.
 
 Here's very last test-patch, testing different ordering of fpu<->cpu reg
 instructions at few places.

Hehe,

I don't mind testing patches, no worries there. Here are the results
this time, doesn't look better than the last run, AFAICT.

[  133.952723] 
[  133.952723] testing speed of async ecb(twofish) encryption
[  133.961946] test 0 (128 bit key, 16 byte blocks): 4768513 operations in 1 
seconds (76296208 bytes)
[  134.968388] test 1 (128 bit key, 64 byte blocks): 2033479 operations in 1 
seconds (130142656 bytes)
[  135.975070] test 2 (128 bit key, 256 byte blocks): 604754 operations in 1 
seconds (154817024 bytes)
[  136.981570] test 3 (128 bit key, 1024 byte blocks): 169578 operations in 1 
seconds (173647872 bytes)
[  137.988191] test 4 (128 bit key, 8192 byte blocks): 21847 operations in 1 
seconds (178970624 bytes)
[  138.994735] test 5 (192 bit key, 16 byte blocks): 4777481 operations in 1 
seconds (76439696 bytes)
[  140.001382] test 6 (192 bit key, 64 byte blocks): 2035352 operations in 1 
seconds (130262528 bytes)
[  141.008038] test 7 (192 bit key, 256 byte blocks): 603240 operations in 1 
seconds (154429440 bytes)
[  142.014591] test 8 (192 bit key, 1024 byte blocks): 169266 operations in 1 
seconds (173328384 bytes)
[  143.021169] test 9 (192 bit key, 8192 byte blocks): 21610 operations in 1 
seconds (177029120 bytes)
[  144.027703] test 10 (256 bit key, 16 byte blocks): 4798051 operations in 1 
seconds (76768816 bytes)
[  145.034341] test 11 (256 bit key, 64 byte blocks): 2036766 operations in 1 
seconds (130353024 bytes)
[  146.041015] test 12 (256 bit key, 256 byte blocks): 604216 operations in 1 
seconds (154679296 bytes)
[  147.047523] test 13 (256 bit key, 1024 byte blocks): 169594 operations in 1 
seconds (173664256 bytes)
[  148.054120] test 14 (256 bit key, 8192 byte blocks): 21889 operations in 1 
seconds (179314688 bytes)
[  149.060657] 
[  149.060657] testing speed of async ecb(twofish) decryption
[  149.069830] test 0 (128 bit key, 16 byte blocks): 4890581 operations in 1 
seconds (78249296 bytes)
[  150.075322] test 1 (128 bit key, 64 byte blocks): 2006891 operations in 1 
seconds (128441024 bytes)
[  151.081994] test 2 (128 bit key, 256 byte blocks): 586650 operations in 1 
seconds (150182400 bytes)
[  152.088522] test 3 (128 bit key, 1024 byte blocks): 164734 operations in 1 
seconds (168687616 bytes)
[  153.091153] test 4 (128 bit key, 8192 byte blocks): 21111 operations in 1 
seconds (172941312 bytes)
[  154.097687] test 5 (192 bit key, 16 byte blocks): 4911365 operations in 1 
seconds (78581840 bytes)
[  155.104371] test 6 (192 bit key, 64 byte blocks): 2025363 operations in 1 
seconds (129623232 bytes)
[  156.54] test 7 (192 bit key, 256 byte blocks): 591229 operations in 1 
seconds (151354624 bytes)
[  157.117723] test 8 (192 bit key, 1024 byte blocks): 164381 operations in 1 
seconds (168326144 bytes)
[  158.124336] test 9 (192 bit key, 8192 byte blocks): 20714 operations in 1 
seconds (169689088 bytes)
[  159.130724] test 10 (256 bit key, 16 byte blocks): 4931938 operations in 1 
seconds (78911008 bytes)
[  160.137379] test 11 (256 bit key, 64 byte blocks): 2029741 operations in 1 
seconds (129903424 bytes)
[  161.144078] test 12 (256 bit key, 256 byte blocks): 589340 operations in 1 
seconds (150871040 bytes)
[  162.150580] test 13 (256 bit key, 1024 byte blocks): 164484 operations in 1 
seconds (168431616 bytes)
[  163.157174] test 14 (256 bit key, 8192 byte blocks): 21116 operations in 1 
seconds (172982272 bytes)
[  164.163694] 
[  164.163694] testing speed of async cbc(twofish) encryption
[  164.12] test 0 (128 bit key, 16 byte blocks): 5197069 operations in 1 
seconds (83153104 bytes)
[  165.186414] test 1 (128 bit key, 64 byte blocks): 1912975 operations in 1 
seconds (122430400 bytes)
[  166.193078] test 2 (128 bit key, 256 byte blocks): 540464 operations in 1 
seconds (138358784 bytes)
[  167.199587] test 3 (128 bit key, 1024 byte blocks): 140709 operations in 1 
seconds (144086016 bytes)
[  168.206209] test 4 (128 bit key, 8192 byte blocks): 17747 operations in 1 
seconds (145383424 bytes)
[  169.212768] test 5 (192 bit key, 16 byte blocks): 5184004 operations in 1 
seconds (82944064 bytes)
[  170.219372] test 6 (192 bit key, 64 byte blocks): 1913377 operations in 1 
seconds (122456128 bytes)
[  171.226028] test 7 (192 bit key, 256 byte blocks): 541385 operations in 1 
seconds (138594560 bytes)
[  172.232538] test 8 (192 bit key, 1024 byte blocks): 140867 operations in 1 
seconds (144247808 bytes)
[  173.239280] test 9 (192 bit key, 8192 byte blocks): 17642 operations in 1 
seconds (144523264 bytes)
[  174.245667] test 10 (256 bit key, 16 byte blocks): 5193804 operations in 1 
seconds (83100864 bytes)
[  

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-22 Thread Jussi Kivilinna
Quoting Borislav Petkov b...@alien8.de:

 On Wed, Aug 22, 2012 at 07:35:12AM +0300, Jussi Kivilinna wrote:
 Looks that encryption lost ~0.4% while decryption gained ~1.8%.

 For 256 byte test, it's still slightly slower than twofish-3way
 (~3%). For 1k
 and 8k tests, it's ~5% faster.

 Here's very last test-patch, testing different ordering of fpu<->cpu reg
 instructions at few places.

 Hehe,

 I don't mind testing patches, no worries there. Here are the results
 this time, doesn't look better than the last run, AFAICT.


Actually it does look better, at least for encryption. Decryption had different
ordering for test, which appears to be bad on bulldozer as it is on
sandy-bridge.

So, yet another patch then :)

Interleaving at some new places (reordered lookup_32bit()s in G-macro) and
doing one of the round rotations one round ahead. Also introduces some
more parallelism inside lookup_32bit.

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  227 +--
 1 file changed, 142 insertions(+), 85 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..1585abb 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  * johannes.goetzfr...@informatik.stud.uni-erlangen.de
  *
+ * Copyright © 2012 Jussi Kivilinna jussi.kivili...@mbnet.fi
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -47,16 +49,22 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
 
-#define RID1  %rax
-#define RID1b %al
-#define RID2  %rbx
-#define RID2b %bl
+#define RT %xmm14
+#define RR %xmm15
+
+#define RID1  %rbp
+#define RID1d %ebp
+#define RID2  %rsi
+#define RID2d %esi
 
 #define RGI1   %rdx
 #define RGI1bl %dl
@@ -65,6 +73,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch
 
+#define RGI3   %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4   %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RGS1  %r8
 #define RGS1d %r8d
 #define RGS2  %r9
@@ -73,89 +88,123 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
-   movlt0(CTX, RID1, 4), dst ## d;  \
-   xorlt1(CTX, RID2, 4), dst ## d;  \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
shrq $16,   src; \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+   movlt0(CTX, RID1, 4), dst ## d;  \
+   movlt1(CTX, RID2, 4), RID2d; \
+   movzbl  src ## bl,RID1d; \
+   xorlRID2d,dst ## d;  \
+   movzbl  src ## bh,RID2d; \
+   interleave_op(il_reg);   \
xorlt2(CTX, RID1, 4), dst ## d;  \
xorlt3(CTX, RID2, 4), dst ## d;
 
-#define G(a, x, t0, t1, t2, t3) \
-   vmovq   a,RGI1;   \
-   vpsrldq $8, a,x;  \
-   vmovq   x,RGI2;   \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+   shrq $16,   reg;
+
+#define G(gi1, gi2, x, t0, t1, t2, t3) \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
+   \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);  \
+   shlq $32,   RGS2;\
+   orq RGS1, RGS2;  \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);  \
+   shlq $32,   RGS1;\
+   orq RGS1, RGS3;
+
+#define round_head_2(a, b, x1, y1, x2, y2) \
+   vmovq   b ## 1, RGI3;   \
+   vpextrq $1, b ## 1, RGI4;   \
\
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-   shrq $16,   RGI1; \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
-   shlq $32,   RGS2; \
-   orq RGS1, RGS2;   \
+   G(RGI1, RGI2, x1, s0, s1, s2, s3);  \
+   vmovq   a ## 2, RGI1;   \
+   vpextrq $1, a ## 2, RGI2;   \
+   vmovq   RGS2, 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-22 Thread Jason Garrett-Glaser
On Wed, Aug 22, 2012 at 12:20 PM, Jussi Kivilinna
jussi.kivili...@mbnet.fi wrote:
 Quoting Borislav Petkov b...@alien8.de:

 On Wed, Aug 22, 2012 at 07:35:12AM +0300, Jussi Kivilinna wrote:
 Looks that encryption lost ~0.4% while decryption gained ~1.8%.

 For 256 byte test, it's still slightly slower than twofish-3way
 (~3%). For 1k
 and 8k tests, it's ~5% faster.

 Here's very last test-patch, testing different ordering of fpu<->cpu reg
 instructions at few places.

 Hehe,

 I don't mind testing patches, no worries there. Here are the results
 this time, doesn't look better than the last run, AFAICT.


 Actually it does look better, at least for encryption. Decryption had 
 different
 ordering for test, which appears to be bad on bulldozer as it is on
 sandy-bridge.

 So, yet another patch then :)

 Interleaving at some new places (reordered lookup_32bit()s in G-macro) and
 doing one of the round rotations one round ahead. Also introduces some
 more parallelism inside lookup_32bit.

Outsider looking in here, but avoiding the 256-way lookup tables
entirely might be faster.  Looking at the twofish code, one byte-wise
calculation looks like this:

a0 = x >> 4; b0 = x & 15;
a1 = a0 ^ b0; b1 = ror4[b0] ^ ashx[a0];
a2 = qt0[n][a1]; b2 = qt1[n][b1];
a3 = a2 ^ b2; b3 = ror4[b2] ^ ashx[a2];
a4 = qt2[n][a3]; b4 = qt3[n][b3];
return (b4 << 4) | a4;

This means that you can do something like this pseudocode (Intel
syntax).  pshufb on ymm registers is AVX2, but splitting it into xmm
operations would probably be fine (as would using this for just a pure
SSE implementation!).  On AVX2 you'd have to double the tables for both
ways, naturally.

constants:
pb_0x0f = {0x0f,0x0f,0x0f ... }
ashx: lookup table
ror4: lookup table
qt0[n]: lookup table
qt1[n]: lookup table
qt2[n]: lookup table
qt3[n]: lookup table

vpand    b0, in, pb_0x0f
vpsrlw   a0, in, 4
vpand    a0, a0, pb_0x0f    ; effectively vpsrlb, but that doesn't exist

vpxor    a1, a0, b0
vpshufb  a0,   ashx, a0
vpshufb  b0,   ror4, b0
vpxor    b1, a0, b0

vpshufb  a2, qt0[n], a1
vpshufb  b2, qt1[n], b1

vpxor    a3, a2, b2
vpshufb  a3,   ashx, a2
vpshufb  b3,   ror4, b2
vpxor    b3, a2, b2

vpshufb  a4, qt2[n], a3
vpshufb  b4, qt3[n], b3

vpsllw   b4, b4, 4          ; effectively vpsllb, but that doesn't exist
vpor     out, a4, b4

That's 15 instructions (plus maybe a move or two) to do 16 lookups for
SSE (~9 cycles by my guessing on a Nehalem).  AVX would run into the
problem of lots of extra vinsert/vextract (just going 16-byte might be
better, might be not, depending on execution units).  AVX2 would be
super fast (15 for 32).

If this works, this could be quite a bit faster with the table-based approach.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-21 Thread Jussi Kivilinna
Quoting Borislav Petkov :

> 
> Here you go:
> 
> [   52.282208]
> [   52.282208] testing speed of async ecb(twofish) encryption

Thanks!

Looks that encryption lost ~0.4% while decryption gained ~1.8%.

For 256 byte test, it's still slightly slower than twofish-3way (~3%). For 1k
and 8k tests, it's ~5% faster.

Here's very last test-patch, testing different ordering of fpu<->cpu reg
instructions at few places.

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  232 ++-
 1 file changed, 154 insertions(+), 78 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..693963a 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  * 
  *
+ * Copyright © 2012 Jussi Kivilinna 
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -47,16 +49,21 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
+
+#define RK1 %xmm12
+#define RK2 %xmm13
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RT %xmm14
 
-#define RID1  %rax
-#define RID1b %al
-#define RID2  %rbx
-#define RID2b %bl
+#define RID1  %rbp
+#define RID1d %ebp
+#define RID2  %rsi
+#define RID2d %esi
 
 #define RGI1   %rdx
 #define RGI1bl %dl
@@ -65,6 +72,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch
 
+#define RGI3   %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4   %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RGS1  %r8
 #define RGS1d %r8d
 #define RGS2  %r9
@@ -73,40 +87,58 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   shrq $16,   src; \
movlt0(CTX, RID1, 4), dst ## d;  \
xorlt1(CTX, RID2, 4), dst ## d;  \
-   shrq $16,   src; \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   interleave_op(il_reg);   \
xorlt2(CTX, RID1, 4), dst ## d;  \
xorlt3(CTX, RID2, 4), dst ## d;
 
-#define G(a, x, t0, t1, t2, t3) \
-   vmovq   a,RGI1;   \
-   vpsrldq $8, a,x;  \
-   vmovq   x,RGI2;   \
-   \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-   shrq $16,   RGI1; \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
-   shlq $32,   RGS2; \
-   orq RGS1, RGS2;   \
-   \
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
-   shrq $16,   RGI2; \
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
-   shlq $32,   RGS3; \
-   orq RGS1, RGS3;   \
-   \
-   vmovq   RGS2, x;  \
-   vpinsrq $1, RGS3, x, x;
+#define dummy(d) /* do nothing */
 
-#define encround(a, b, c, d, x, y) \
-   G(a, x, s0, s1, s2, s3);   \
-   G(b, y, s1, s2, s3, s0);   \
+#define shr_next(reg) \
+   shrq $16,   reg;
+
+#define G_enc(gi1, gi2, x, t0, t1, t2, t3) \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);  \
+   shlq $32,   RGS2;\
+   orq RGS1, RGS2;  \
+   \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);  \
+   shlq $32,   RGS1;\
+   orq RGS1, RGS3;
+
+#define encround_head_2(a, b, c, d, x1, y1, x2, y2) \
+   vmovq   b ## 1, RGI3;   \
+   vpextrq $1, b ## 1, RGI4;   \
+   G_enc(RGI1, RGI2, x1, s0, s1, s2, s3);  \
+   vmovq   a ## 2, RGI1;   \
+   vpextrq $1, a ## 2, RGI2;   \
+   vmovq   RGS2, x1;   \
+   vpinsrq $1, RGS3, x1, x1;   \
+   G_enc(RGI3, RGI4, y1, s1, s2, s3, s0);  \
+   vmovq   b ## 2, RGI3;   \
+  

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-21 Thread Jussi Kivilinna
Quoting Borislav Petkov b...@alien8.de:

 
 Here you go:
 
 [   52.282208]
 [   52.282208] testing speed of async ecb(twofish) encryption

Thanks!

Looks that encryption lost ~0.4% while decryption gained ~1.8%.

For 256 byte test, it's still slightly slower than twofish-3way (~3%). For 1k
and 8k tests, it's ~5% faster.

Here's very last test-patch, testing different ordering of fpu<->cpu reg
instructions at few places.

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  232 ++-
 1 file changed, 154 insertions(+), 78 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..693963a 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  * johannes.goetzfr...@informatik.stud.uni-erlangen.de
  *
+ * Copyright © 2012 Jussi Kivilinna jussi.kivili...@mbnet.fi
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -47,16 +49,21 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
+
+#define RK1 %xmm12
+#define RK2 %xmm13
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RT %xmm14
 
-#define RID1  %rax
-#define RID1b %al
-#define RID2  %rbx
-#define RID2b %bl
+#define RID1  %rbp
+#define RID1d %ebp
+#define RID2  %rsi
+#define RID2d %esi
 
 #define RGI1   %rdx
 #define RGI1bl %dl
@@ -65,6 +72,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch
 
+#define RGI3   %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4   %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RGS1  %r8
 #define RGS1d %r8d
 #define RGS2  %r9
@@ -73,40 +87,58 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   shrq $16,   src; \
movlt0(CTX, RID1, 4), dst ## d;  \
xorlt1(CTX, RID2, 4), dst ## d;  \
-   shrq $16,   src; \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   interleave_op(il_reg);   \
xorlt2(CTX, RID1, 4), dst ## d;  \
xorlt3(CTX, RID2, 4), dst ## d;
 
-#define G(a, x, t0, t1, t2, t3) \
-   vmovq   a,RGI1;   \
-   vpsrldq $8, a,x;  \
-   vmovq   x,RGI2;   \
-   \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-   shrq $16,   RGI1; \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
-   shlq $32,   RGS2; \
-   orq RGS1, RGS2;   \
-   \
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
-   shrq $16,   RGI2; \
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
-   shlq $32,   RGS3; \
-   orq RGS1, RGS3;   \
-   \
-   vmovq   RGS2, x;  \
-   vpinsrq $1, RGS3, x, x;
+#define dummy(d) /* do nothing */
 
-#define encround(a, b, c, d, x, y) \
-   G(a, x, s0, s1, s2, s3);   \
-   G(b, y, s1, s2, s3, s0);   \
+#define shr_next(reg) \
+   shrq $16,   reg;
+
+#define G_enc(gi1, gi2, x, t0, t1, t2, t3) \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);  \
+   shlq $32,   RGS2;\
+   orq RGS1, RGS2;  \
+   \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);  \
+   shlq $32,   RGS1;\
+   orq RGS1, RGS3;
+
+#define encround_head_2(a, b, c, d, x1, y1, x2, y2) \
+   vmovq   b ## 1, RGI3;   \
+   vpextrq $1, b ## 1, RGI4;   \
+   G_enc(RGI1, RGI2, x1, s0, s1, s2, s3);  \
+   vmovq   a ## 2, RGI1;   \
+   vpextrq $1, a ## 2, RGI2;   \
+   vmovq   RGS2, x1;   \
+   vpinsrq $1, RGS3, x1, x1;   \
+   G_enc(RGI3, 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-20 Thread Borislav Petkov
On Fri, Aug 17, 2012 at 10:37:10AM +0300, Jussi Kivilinna wrote:
> I made few further changes, mainly moving/interleaving 'vmovq/vpextrq'
> ahead so they should be completed before those target registers are
> needed. This only gave 0.5% increase on Sandy-bridge, but might help
> more on Bulldozer.

Here you go:

[   52.282208] 
[   52.282208] testing speed of async ecb(twofish) encryption
[   52.291580] test 0 (128 bit key, 16 byte blocks): 4890079 operations in 1 
seconds (78241264 bytes)
[   53.301588] test 1 (128 bit key, 64 byte blocks): 2045945 operations in 1 
seconds (130940480 bytes)
[   54.309656] test 2 (128 bit key, 256 byte blocks): 604184 operations in 1 
seconds (154671104 bytes)
[   55.317289] test 3 (128 bit key, 1024 byte blocks): 168541 operations in 1 
seconds (172585984 bytes)
[   56.325565] test 4 (128 bit key, 8192 byte blocks): 21673 operations in 1 
seconds (177545216 bytes)
[   57.333529] test 5 (192 bit key, 16 byte blocks): 4877931 operations in 1 
seconds (78046896 bytes)
[   58.341588] test 6 (192 bit key, 64 byte blocks): 2044495 operations in 1 
seconds (130847680 bytes)
[   59.349647] test 7 (192 bit key, 256 byte blocks): 604909 operations in 1 
seconds (154856704 bytes)
[   60.357533] test 8 (192 bit key, 1024 byte blocks): 167836 operations in 1 
seconds (171864064 bytes)
[   61.365545] test 9 (192 bit key, 8192 byte blocks): 21439 operations in 1 
seconds (175628288 bytes)
[   62.369497] test 10 (256 bit key, 16 byte blocks): 4907149 operations in 1 
seconds (78514384 bytes)
[   63.373535] test 11 (256 bit key, 64 byte blocks): 2060437 operations in 1 
seconds (131867968 bytes)
[   64.381620] test 12 (256 bit key, 256 byte blocks): 604784 operations in 1 
seconds (154824704 bytes)
[   65.389523] test 13 (256 bit key, 1024 byte blocks): 168547 operations in 1 
seconds (172592128 bytes)
[   66.397520] test 14 (256 bit key, 8192 byte blocks): 21682 operations in 1 
seconds (177618944 bytes)
[   67.405461] 
[   67.405461] testing speed of async ecb(twofish) decryption
[   67.414776] test 0 (128 bit key, 16 byte blocks): 4903251 operations in 1 
seconds (78452016 bytes)
[   68.421569] test 1 (128 bit key, 64 byte blocks): 1979230 operations in 1 
seconds (126670720 bytes)
[   69.429644] test 2 (128 bit key, 256 byte blocks): 591549 operations in 1 
seconds (151436544 bytes)
[   70.437574] test 3 (128 bit key, 1024 byte blocks): 166478 operations in 1 
seconds (170473472 bytes)
[   71.445590] test 4 (128 bit key, 8192 byte blocks): 21441 operations in 1 
seconds (175644672 bytes)
[   72.453536] test 5 (192 bit key, 16 byte blocks): 4895430 operations in 1 
seconds (78326880 bytes)
[   73.461596] test 6 (192 bit key, 64 byte blocks): 1976120 operations in 1 
seconds (126471680 bytes)
[   74.469680] test 7 (192 bit key, 256 byte blocks): 590021 operations in 1 
seconds (151045376 bytes)
[   75.477600] test 8 (192 bit key, 1024 byte blocks): 165925 operations in 1 
seconds (169907200 bytes)
[   76.485606] test 9 (192 bit key, 8192 byte blocks): 21087 operations in 1 
seconds (172744704 bytes)
[   77.493561] test 10 (256 bit key, 16 byte blocks): 4882275 operations in 1 
seconds (78116400 bytes)
[   78.501621] test 11 (256 bit key, 64 byte blocks): 1976460 operations in 1 
seconds (126493440 bytes)
[   79.509706] test 12 (256 bit key, 256 byte blocks): 591122 operations in 1 
seconds (151327232 bytes)
[   80.517617] test 13 (256 bit key, 1024 byte blocks): 166587 operations in 1 
seconds (170585088 bytes)
[   81.525606] test 14 (256 bit key, 8192 byte blocks): 21439 operations in 1 
seconds (175628288 bytes)
[   82.533520] 
[   82.533520] testing speed of async cbc(twofish) encryption
[   82.547843] test 0 (128 bit key, 16 byte blocks): 5182177 operations in 1 
seconds (82914832 bytes)
[   83.557344] test 1 (128 bit key, 64 byte blocks): 1913550 operations in 1 
seconds (122467200 bytes)
[   84.565418] test 2 (128 bit key, 256 byte blocks): 540406 operations in 1 
seconds (138343936 bytes)
[   85.573320] test 3 (128 bit key, 1024 byte blocks): 141160 operations in 1 
seconds (144547840 bytes)
[   86.581346] test 4 (128 bit key, 8192 byte blocks): 17791 operations in 1 
seconds (145743872 bytes)
[   87.589283] test 5 (192 bit key, 16 byte blocks): 5167742 operations in 1 
seconds (82683872 bytes)
[   88.597316] test 6 (192 bit key, 64 byte blocks): 1913755 operations in 1 
seconds (122480320 bytes)
[   89.605689] test 7 (192 bit key, 256 byte blocks): 541933 operations in 1 
seconds (138734848 bytes)
[   90.613599] test 8 (192 bit key, 1024 byte blocks): 141155 operations in 1 
seconds (144542720 bytes)
[   91.621597] test 9 (192 bit key, 8192 byte blocks): 17652 operations in 1 
seconds (144605184 bytes)
[   92.629509] test 10 (256 bit key, 16 byte blocks): 5166590 operations in 1 
seconds (82665440 bytes)
[   93.637594] test 11 (256 bit key, 64 byte blocks): 1906451 operations in 1 
seconds (122012864 bytes)
[   94.645680] test 12 (256 bit key, 256 byte blocks): 541165 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-20 Thread Borislav Petkov
On Fri, Aug 17, 2012 at 10:37:10AM +0300, Jussi Kivilinna wrote:
> I made few further changes, mainly moving/interleaving 'vmovq/vpextrq'
> ahead so they should be completed before those target registers are
> needed. This only gave 0.5% increase on Sandy-bridge, but might help
> more on Bulldozer.

Here you go:

[   52.282208] 
[   52.282208] testing speed of async ecb(twofish) encryption
[   52.291580] test 0 (128 bit key, 16 byte blocks): 4890079 operations in 1 
seconds (78241264 bytes)
[   53.301588] test 1 (128 bit key, 64 byte blocks): 2045945 operations in 1 
seconds (130940480 bytes)
[   54.309656] test 2 (128 bit key, 256 byte blocks): 604184 operations in 1 
seconds (154671104 bytes)
[   55.317289] test 3 (128 bit key, 1024 byte blocks): 168541 operations in 1 
seconds (172585984 bytes)
[   56.325565] test 4 (128 bit key, 8192 byte blocks): 21673 operations in 1 
seconds (177545216 bytes)
[   57.333529] test 5 (192 bit key, 16 byte blocks): 4877931 operations in 1 
seconds (78046896 bytes)
[   58.341588] test 6 (192 bit key, 64 byte blocks): 2044495 operations in 1 
seconds (130847680 bytes)
[   59.349647] test 7 (192 bit key, 256 byte blocks): 604909 operations in 1 
seconds (154856704 bytes)
[   60.357533] test 8 (192 bit key, 1024 byte blocks): 167836 operations in 1 
seconds (171864064 bytes)
[   61.365545] test 9 (192 bit key, 8192 byte blocks): 21439 operations in 1 
seconds (175628288 bytes)
[   62.369497] test 10 (256 bit key, 16 byte blocks): 4907149 operations in 1 
seconds (78514384 bytes)
[   63.373535] test 11 (256 bit key, 64 byte blocks): 2060437 operations in 1 
seconds (131867968 bytes)
[   64.381620] test 12 (256 bit key, 256 byte blocks): 604784 operations in 1 
seconds (154824704 bytes)
[   65.389523] test 13 (256 bit key, 1024 byte blocks): 168547 operations in 1 
seconds (172592128 bytes)
[   66.397520] test 14 (256 bit key, 8192 byte blocks): 21682 operations in 1 
seconds (177618944 bytes)
[   67.405461] 
[   67.405461] testing speed of async ecb(twofish) decryption
[   67.414776] test 0 (128 bit key, 16 byte blocks): 4903251 operations in 1 
seconds (78452016 bytes)
[   68.421569] test 1 (128 bit key, 64 byte blocks): 1979230 operations in 1 
seconds (126670720 bytes)
[   69.429644] test 2 (128 bit key, 256 byte blocks): 591549 operations in 1 
seconds (151436544 bytes)
[   70.437574] test 3 (128 bit key, 1024 byte blocks): 166478 operations in 1 
seconds (170473472 bytes)
[   71.445590] test 4 (128 bit key, 8192 byte blocks): 21441 operations in 1 
seconds (175644672 bytes)
[   72.453536] test 5 (192 bit key, 16 byte blocks): 4895430 operations in 1 
seconds (78326880 bytes)
[   73.461596] test 6 (192 bit key, 64 byte blocks): 1976120 operations in 1 
seconds (126471680 bytes)
[   74.469680] test 7 (192 bit key, 256 byte blocks): 590021 operations in 1 
seconds (151045376 bytes)
[   75.477600] test 8 (192 bit key, 1024 byte blocks): 165925 operations in 1 
seconds (169907200 bytes)
[   76.485606] test 9 (192 bit key, 8192 byte blocks): 21087 operations in 1 
seconds (172744704 bytes)
[   77.493561] test 10 (256 bit key, 16 byte blocks): 4882275 operations in 1 
seconds (78116400 bytes)
[   78.501621] test 11 (256 bit key, 64 byte blocks): 1976460 operations in 1 
seconds (126493440 bytes)
[   79.509706] test 12 (256 bit key, 256 byte blocks): 591122 operations in 1 
seconds (151327232 bytes)
[   80.517617] test 13 (256 bit key, 1024 byte blocks): 166587 operations in 1 
seconds (170585088 bytes)
[   81.525606] test 14 (256 bit key, 8192 byte blocks): 21439 operations in 1 
seconds (175628288 bytes)
[   82.533520] 
[   82.533520] testing speed of async cbc(twofish) encryption
[   82.547843] test 0 (128 bit key, 16 byte blocks): 5182177 operations in 1 
seconds (82914832 bytes)
[   83.557344] test 1 (128 bit key, 64 byte blocks): 1913550 operations in 1 
seconds (122467200 bytes)
[   84.565418] test 2 (128 bit key, 256 byte blocks): 540406 operations in 1 
seconds (138343936 bytes)
[   85.573320] test 3 (128 bit key, 1024 byte blocks): 141160 operations in 1 
seconds (144547840 bytes)
[   86.581346] test 4 (128 bit key, 8192 byte blocks): 17791 operations in 1 
seconds (145743872 bytes)
[   87.589283] test 5 (192 bit key, 16 byte blocks): 5167742 operations in 1 
seconds (82683872 bytes)
[   88.597316] test 6 (192 bit key, 64 byte blocks): 1913755 operations in 1 
seconds (122480320 bytes)
[   89.605689] test 7 (192 bit key, 256 byte blocks): 541933 operations in 1 
seconds (138734848 bytes)
[   90.613599] test 8 (192 bit key, 1024 byte blocks): 141155 operations in 1 
seconds (144542720 bytes)
[   91.621597] test 9 (192 bit key, 8192 byte blocks): 17652 operations in 1 
seconds (144605184 bytes)
[   92.629509] test 10 (256 bit key, 16 byte blocks): 5166590 operations in 1 
seconds (82665440 bytes)
[   93.637594] test 11 (256 bit key, 64 byte blocks): 1906451 operations in 1 
seconds (122012864 bytes)
[   94.645680] test 12 (256 bit key, 256 byte blocks): 541165 operations 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-17 Thread Jussi Kivilinna
Quoting Borislav Petkov :

>
> Yep, looks better than the previous run and also a bit better or on par
> with the initial run I did.
>

I made a few further changes, mainly moving/interleaving 'vmovq/vpextrq' ahead
so that they should be completed before those target registers are needed. This
only gave a 0.5% increase on Sandy Bridge, but might help more on Bulldozer.

-Jussi

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  205 +--
 1 file changed, 130 insertions(+), 75 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..6638a87 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  * 
  *
+ * Copyright © 2012 Jussi Kivilinna 
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -47,16 +49,21 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
 
-#define RID1  %rax
-#define RID1b %al
-#define RID2  %rbx
-#define RID2b %bl
+#define RT %xmm14
+
+#define RID1  %rbp
+#define RID1d %ebp
+#define RID2  %rsi
+#define RID2d %esi
 
 #define RGI1   %rdx
 #define RGI1bl %dl
@@ -65,6 +72,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch
 
+#define RGI3   %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4   %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RGS1  %r8
 #define RGS1d %r8d
 #define RGS2  %r9
@@ -73,40 +87,53 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   shrq $16,   src; \
movlt0(CTX, RID1, 4), dst ## d;  \
xorlt1(CTX, RID2, 4), dst ## d;  \
-   shrq $16,   src; \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   interleave_op(il_reg);   \
xorlt2(CTX, RID1, 4), dst ## d;  \
xorlt3(CTX, RID2, 4), dst ## d;
 
-#define G(a, x, t0, t1, t2, t3) \
-   vmovq   a,RGI1;   \
-   vpsrldq $8, a,x;  \
-   vmovq   x,RGI2;   \
-   \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-   shrq $16,   RGI1; \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
-   shlq $32,   RGS2; \
-   orq RGS1, RGS2;   \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+   shrq $16,   reg;
+
+#define G(gi1, gi2, x, t0, t1, t2, t3) \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);  \
+   shlq $32,   RGS2;\
+   orq RGS1, RGS2;  \
\
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
-   shrq $16,   RGI2; \
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
-   shlq $32,   RGS3; \
-   orq RGS1, RGS3;   \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);  \
+   shlq $32,   RGS1;\
+   orq RGS1, RGS3;  \
\
-   vmovq   RGS2, x;  \
+   vmovq   RGS2, x; \
vpinsrq $1, RGS3, x, x;
 
-#define encround(a, b, c, d, x, y) \
-   G(a, x, s0, s1, s2, s3);   \
-   G(b, y, s1, s2, s3, s0);   \
+#define encround_head_2(a, b, c, d, x1, y1, x2, y2) \
+   vmovq   b ## 1, RGI3;   \
+   vpextrq $1, b ## 1, RGI4;   \
+   G(RGI1, RGI2, x1, s0, s1, s2, s3);  \
+   vmovq   a ## 2, RGI1;   \
+   vpextrq $1, a ## 2, RGI2;   \
+   G(RGI3, RGI4, y1, s1, s2, s3, s0);  \
+   vmovq   b ## 2, RGI3;   \
+   vpextrq $1, b ## 2, RGI4;   \
+   G(RGI1, 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-17 Thread Jussi Kivilinna
Quoting Borislav Petkov b...@alien8.de:


> Yep, looks better than the previous run and also a bit better or on par
> with the initial run I did.


I made a few further changes, mainly moving/interleaving 'vmovq/vpextrq' ahead
so that they should be completed before those target registers are needed. This
only gave a 0.5% increase on Sandy Bridge, but might help more on Bulldozer.

-Jussi

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  205 +--
 1 file changed, 130 insertions(+), 75 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..6638a87 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  * johannes.goetzfr...@informatik.stud.uni-erlangen.de
  *
+ * Copyright © 2012 Jussi Kivilinna jussi.kivili...@mbnet.fi
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -47,16 +49,21 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
 
-#define RID1  %rax
-#define RID1b %al
-#define RID2  %rbx
-#define RID2b %bl
+#define RT %xmm14
+
+#define RID1  %rbp
+#define RID1d %ebp
+#define RID2  %rsi
+#define RID2d %esi
 
 #define RGI1   %rdx
 #define RGI1bl %dl
@@ -65,6 +72,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch
 
+#define RGI3   %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4   %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RGS1  %r8
 #define RGS1d %r8d
 #define RGS2  %r9
@@ -73,40 +87,53 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   shrq $16,   src; \
movlt0(CTX, RID1, 4), dst ## d;  \
xorlt1(CTX, RID2, 4), dst ## d;  \
-   shrq $16,   src; \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   interleave_op(il_reg);   \
xorlt2(CTX, RID1, 4), dst ## d;  \
xorlt3(CTX, RID2, 4), dst ## d;
 
-#define G(a, x, t0, t1, t2, t3) \
-   vmovq   a,RGI1;   \
-   vpsrldq $8, a,x;  \
-   vmovq   x,RGI2;   \
-   \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-   shrq $16,   RGI1; \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
-   shlq $32,   RGS2; \
-   orq RGS1, RGS2;   \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+   shrq $16,   reg;
+
+#define G(gi1, gi2, x, t0, t1, t2, t3) \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
+   lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);  \
+   shlq $32,   RGS2;\
+   orq RGS1, RGS2;  \
\
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
-   shrq $16,   RGI2; \
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
-   shlq $32,   RGS3; \
-   orq RGS1, RGS3;   \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
+   lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);  \
+   shlq $32,   RGS1;\
+   orq RGS1, RGS3;  \
\
-   vmovq   RGS2, x;  \
+   vmovq   RGS2, x; \
vpinsrq $1, RGS3, x, x;
 
-#define encround(a, b, c, d, x, y) \
-   G(a, x, s0, s1, s2, s3);   \
-   G(b, y, s1, s2, s3, s0);   \
+#define encround_head_2(a, b, c, d, x1, y1, x2, y2) \
+   vmovq   b ## 1, RGI3;   \
+   vpextrq $1, b ## 1, RGI4;   \
+   G(RGI1, RGI2, x1, s0, s1, s2, s3);  \
+   vmovq   a ## 2, RGI1;   \
+   vpextrq $1, a ## 2, RGI2;   \
+   G(RGI3, RGI4, y1, s1, s2, s3, s0);  \
+   vmovq   b 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-16 Thread Jussi Kivilinna

Quoting Borislav Petkov :


On Wed, Aug 15, 2012 at 08:34:25PM +0300, Jussi Kivilinna wrote:

About ~5% slower, probably because I was tuning for sandy-bridge and
introduced more FPU<=>CPU register moves.

Here's new version of patch, with FPU<=>CPU moves from original
implementation.

(Note: also changes encryption function to inline all code in to main
function, decryption still places common code to separate function to
reduce object size. This is to measure the difference.)


Yep, looks better than the previous run and also a bit better or on par
with the initial run I did.


Thanks again. The speed gained with the patch is ~8%, which is enough to get
twofish-avx past twofish-3way.




The thing is, I'm not sure whether optimizing the thing for each uarch
is a workable solution software-wise or maybe having a single version
which performs sufficiently ok on all uarches is easier/better to
maintain without causing code bloat. Hmmm...


Agreed, testing on multiple CPUs to get a single well-working version is
what I have done in the past. But purchasing all the latest CPUs on
the market isn't an option for me, and for testing AVX I'm stuck with
Sandy Bridge :)


-Jussi


4th:

ran like 1st.

[ 1014.074150]
[ 1014.074150] testing speed of async ecb(twofish) encryption
[ 1014.083829] test 0 (128 bit key, 16 byte blocks): 4870055  
operations in 1 seconds (77920880 bytes)
[ 1015.092757] test 1 (128 bit key, 64 byte blocks): 2043828  
operations in 1 seconds (130804992 bytes)
[ 1016.099441] test 2 (128 bit key, 256 byte blocks): 606400  
operations in 1 seconds (155238400 bytes)
[ 1017.105939] test 3 (128 bit key, 1024 byte blocks): 168939  
operations in 1 seconds (172993536 bytes)
[ 1018.112517] test 4 (128 bit key, 8192 byte blocks): 21777  
operations in 1 seconds (178397184 bytes)
[ 1019.119035] test 5 (192 bit key, 16 byte blocks): 4882254  
operations in 1 seconds (78116064 bytes)
[ 1020.125716] test 6 (192 bit key, 64 byte blocks): 2043230  
operations in 1 seconds (130766720 bytes)
[ 1021.132391] test 7 (192 bit key, 256 byte blocks): 607477  
operations in 1 seconds (155514112 bytes)
[ 1022.138889] test 8 (192 bit key, 1024 byte blocks): 168743  
operations in 1 seconds (172792832 bytes)
[ 1023.145476] test 9 (192 bit key, 8192 byte blocks): 21442  
operations in 1 seconds (175652864 bytes)
[ 1024.152012] test 10 (256 bit key, 16 byte blocks): 4891863  
operations in 1 seconds (78269808 bytes)
[ 1025.158684] test 11 (256 bit key, 64 byte blocks): 2049390  
operations in 1 seconds (131160960 bytes)
[ 1026.165366] test 12 (256 bit key, 256 byte blocks): 606847  
operations in 1 seconds (155352832 bytes)
[ 1027.171841] test 13 (256 bit key, 1024 byte blocks): 169228  
operations in 1 seconds (173289472 bytes)
[ 1028.178436] test 14 (256 bit key, 8192 byte blocks): 21773  
operations in 1 seconds (178364416 bytes)

[ 1029.184981]
[ 1029.184981] testing speed of async ecb(twofish) decryption
[ 1029.194508] test 0 (128 bit key, 16 byte blocks): 4931065  
operations in 1 seconds (78897040 bytes)
[ 1030.199640] test 1 (128 bit key, 64 byte blocks): 2056931  
operations in 1 seconds (131643584 bytes)
[ 1031.206303] test 2 (128 bit key, 256 byte blocks): 589409  
operations in 1 seconds (150888704 bytes)
[ 1032.212832] test 3 (128 bit key, 1024 byte blocks): 163681  
operations in 1 seconds (167609344 bytes)
[ 1033.219443] test 4 (128 bit key, 8192 byte blocks): 21062  
operations in 1 seconds (172539904 bytes)
[ 1034.225979] test 5 (192 bit key, 16 byte blocks): 4931537  
operations in 1 seconds (78904592 bytes)
[ 1035.232608] test 6 (192 bit key, 64 byte blocks): 2053989  
operations in 1 seconds (131455296 bytes)
[ 1036.239289] test 7 (192 bit key, 256 byte blocks): 589591  
operations in 1 seconds (150935296 bytes)
[ 1037.241784] test 8 (192 bit key, 1024 byte blocks): 163565  
operations in 1 seconds (167490560 bytes)
[ 1038.244387] test 9 (192 bit key, 8192 byte blocks): 20899  
operations in 1 seconds (171204608 bytes)
[ 1039.250923] test 10 (256 bit key, 16 byte blocks): 4937343  
operations in 1 seconds (78997488 bytes)
[ 1040.257589] test 11 (256 bit key, 64 byte blocks): 2050678  
operations in 1 seconds (131243392 bytes)
[ 1041.264262] test 12 (256 bit key, 256 byte blocks): 586869  
operations in 1 seconds (150238464 bytes)
[ 1042.270753] test 13 (256 bit key, 1024 byte blocks): 163548  
operations in 1 seconds (167473152 bytes)
[ 1043.277365] test 14 (256 bit key, 8192 byte blocks): 21053  
operations in 1 seconds (172466176 bytes)

[ 1044.283892]
[ 1044.283892] testing speed of async cbc(twofish) encryption
[ 1044.293349] test 0 (128 bit key, 16 byte blocks): 5186240  
operations in 1 seconds (82979840 bytes)
[ 1045.298534] test 1 (128 bit key, 64 byte blocks): 1921034  
operations in 1 seconds (122946176 bytes)
[ 1046.305207] test 2 (128 bit key, 256 byte blocks): 542787  
operations in 1 seconds (138953472 bytes)
[ 1047.311699] test 3 (128 bit key, 1024 byte blocks): 141399  
operations in 1 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-16 Thread Borislav Petkov
On Wed, Aug 15, 2012 at 08:34:25PM +0300, Jussi Kivilinna wrote:
> About ~5% slower, probably because I was tuning for sandy-bridge and
> introduced more FPU<=>CPU register moves.
>
> Here's new version of patch, with FPU<=>CPU moves from original
> implementation.
>
> (Note: also changes encryption function to inline all code in to main
> function, decryption still places common code to separate function to
> reduce object size. This is to measure the difference.)

Yep, looks better than the previous run and also a bit better or on par
with the initial run I did.

The thing is, I'm not sure whether optimizing the thing for each uarch
is a workable solution software-wise or maybe having a single version
which performs sufficiently ok on all uarches is easier/better to
maintain without causing code bloat. Hmmm...

4th:

ran like 1st.

[ 1014.074150] 
[ 1014.074150] testing speed of async ecb(twofish) encryption
[ 1014.083829] test 0 (128 bit key, 16 byte blocks): 4870055 operations in 1 
seconds (77920880 bytes)
[ 1015.092757] test 1 (128 bit key, 64 byte blocks): 2043828 operations in 1 
seconds (130804992 bytes)
[ 1016.099441] test 2 (128 bit key, 256 byte blocks): 606400 operations in 1 
seconds (155238400 bytes)
[ 1017.105939] test 3 (128 bit key, 1024 byte blocks): 168939 operations in 1 
seconds (172993536 bytes)
[ 1018.112517] test 4 (128 bit key, 8192 byte blocks): 21777 operations in 1 
seconds (178397184 bytes)
[ 1019.119035] test 5 (192 bit key, 16 byte blocks): 4882254 operations in 1 
seconds (78116064 bytes)
[ 1020.125716] test 6 (192 bit key, 64 byte blocks): 2043230 operations in 1 
seconds (130766720 bytes)
[ 1021.132391] test 7 (192 bit key, 256 byte blocks): 607477 operations in 1 
seconds (155514112 bytes)
[ 1022.138889] test 8 (192 bit key, 1024 byte blocks): 168743 operations in 1 
seconds (172792832 bytes)
[ 1023.145476] test 9 (192 bit key, 8192 byte blocks): 21442 operations in 1 
seconds (175652864 bytes)
[ 1024.152012] test 10 (256 bit key, 16 byte blocks): 4891863 operations in 1 
seconds (78269808 bytes)
[ 1025.158684] test 11 (256 bit key, 64 byte blocks): 2049390 operations in 1 
seconds (131160960 bytes)
[ 1026.165366] test 12 (256 bit key, 256 byte blocks): 606847 operations in 1 
seconds (155352832 bytes)
[ 1027.171841] test 13 (256 bit key, 1024 byte blocks): 169228 operations in 1 
seconds (173289472 bytes)
[ 1028.178436] test 14 (256 bit key, 8192 byte blocks): 21773 operations in 1 
seconds (178364416 bytes)
[ 1029.184981] 
[ 1029.184981] testing speed of async ecb(twofish) decryption
[ 1029.194508] test 0 (128 bit key, 16 byte blocks): 4931065 operations in 1 
seconds (78897040 bytes)
[ 1030.199640] test 1 (128 bit key, 64 byte blocks): 2056931 operations in 1 
seconds (131643584 bytes)
[ 1031.206303] test 2 (128 bit key, 256 byte blocks): 589409 operations in 1 
seconds (150888704 bytes)
[ 1032.212832] test 3 (128 bit key, 1024 byte blocks): 163681 operations in 1 
seconds (167609344 bytes)
[ 1033.219443] test 4 (128 bit key, 8192 byte blocks): 21062 operations in 1 
seconds (172539904 bytes)
[ 1034.225979] test 5 (192 bit key, 16 byte blocks): 4931537 operations in 1 
seconds (78904592 bytes)
[ 1035.232608] test 6 (192 bit key, 64 byte blocks): 2053989 operations in 1 
seconds (131455296 bytes)
[ 1036.239289] test 7 (192 bit key, 256 byte blocks): 589591 operations in 1 
seconds (150935296 bytes)
[ 1037.241784] test 8 (192 bit key, 1024 byte blocks): 163565 operations in 1 
seconds (167490560 bytes)
[ 1038.244387] test 9 (192 bit key, 8192 byte blocks): 20899 operations in 1 
seconds (171204608 bytes)
[ 1039.250923] test 10 (256 bit key, 16 byte blocks): 4937343 operations in 1 
seconds (78997488 bytes)
[ 1040.257589] test 11 (256 bit key, 64 byte blocks): 2050678 operations in 1 
seconds (131243392 bytes)
[ 1041.264262] test 12 (256 bit key, 256 byte blocks): 586869 operations in 1 
seconds (150238464 bytes)
[ 1042.270753] test 13 (256 bit key, 1024 byte blocks): 163548 operations in 1 
seconds (167473152 bytes)
[ 1043.277365] test 14 (256 bit key, 8192 byte blocks): 21053 operations in 1 
seconds (172466176 bytes)
[ 1044.283892] 
[ 1044.283892] testing speed of async cbc(twofish) encryption
[ 1044.293349] test 0 (128 bit key, 16 byte blocks): 5186240 operations in 1 
seconds (82979840 bytes)
[ 1045.298534] test 1 (128 bit key, 64 byte blocks): 1921034 operations in 1 
seconds (122946176 bytes)
[ 1046.305207] test 2 (128 bit key, 256 byte blocks): 542787 operations in 1 
seconds (138953472 bytes)
[ 1047.311699] test 3 (128 bit key, 1024 byte blocks): 141399 operations in 1 
seconds (144792576 bytes)
[ 1048.318312] test 4 (128 bit key, 8192 byte blocks): 17755 operations in 1 
seconds (145448960 bytes)
[ 1049.324829] test 5 (192 bit key, 16 byte blocks): 5196441 operations in 1 
seconds (83143056 bytes)
[ 1050.331485] test 6 (192 bit key, 64 byte blocks): 1921456 operations in 1 
seconds (122973184 bytes)
[ 1051.338157] test 7 (192 bit key, 256 byte blocks): 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-16 Thread Borislav Petkov
On Wed, Aug 15, 2012 at 08:34:25PM +0300, Jussi Kivilinna wrote:
> About ~5% slower, probably because I was tuning for sandy-bridge and
> introduced more FPU<=>CPU register moves.
>
> Here's new version of patch, with FPU<=>CPU moves from original
> implementation.
>
> (Note: also changes encryption function to inline all code in to main
> function, decryption still places common code to separate function to
> reduce object size. This is to measure the difference.)

Yep, looks better than the previous run and also a bit better or on par
with the initial run I did.

The thing is, I'm not sure whether optimizing the thing for each uarch
is a workable solution software-wise or maybe having a single version
which performs sufficiently ok on all uarches is easier/better to
maintain without causing code bloat. Hmmm...

4th:

ran like 1st.

[ 1014.074150] 
[ 1014.074150] testing speed of async ecb(twofish) encryption
[ 1014.083829] test 0 (128 bit key, 16 byte blocks): 4870055 operations in 1 
seconds (77920880 bytes)
[ 1015.092757] test 1 (128 bit key, 64 byte blocks): 2043828 operations in 1 
seconds (130804992 bytes)
[ 1016.099441] test 2 (128 bit key, 256 byte blocks): 606400 operations in 1 
seconds (155238400 bytes)
[ 1017.105939] test 3 (128 bit key, 1024 byte blocks): 168939 operations in 1 
seconds (172993536 bytes)
[ 1018.112517] test 4 (128 bit key, 8192 byte blocks): 21777 operations in 1 
seconds (178397184 bytes)
[ 1019.119035] test 5 (192 bit key, 16 byte blocks): 4882254 operations in 1 
seconds (78116064 bytes)
[ 1020.125716] test 6 (192 bit key, 64 byte blocks): 2043230 operations in 1 
seconds (130766720 bytes)
[ 1021.132391] test 7 (192 bit key, 256 byte blocks): 607477 operations in 1 
seconds (155514112 bytes)
[ 1022.138889] test 8 (192 bit key, 1024 byte blocks): 168743 operations in 1 
seconds (172792832 bytes)
[ 1023.145476] test 9 (192 bit key, 8192 byte blocks): 21442 operations in 1 
seconds (175652864 bytes)
[ 1024.152012] test 10 (256 bit key, 16 byte blocks): 4891863 operations in 1 
seconds (78269808 bytes)
[ 1025.158684] test 11 (256 bit key, 64 byte blocks): 2049390 operations in 1 
seconds (131160960 bytes)
[ 1026.165366] test 12 (256 bit key, 256 byte blocks): 606847 operations in 1 
seconds (155352832 bytes)
[ 1027.171841] test 13 (256 bit key, 1024 byte blocks): 169228 operations in 1 
seconds (173289472 bytes)
[ 1028.178436] test 14 (256 bit key, 8192 byte blocks): 21773 operations in 1 
seconds (178364416 bytes)
[ 1029.184981] 
[ 1029.184981] testing speed of async ecb(twofish) decryption
[ 1029.194508] test 0 (128 bit key, 16 byte blocks): 4931065 operations in 1 
seconds (78897040 bytes)
[ 1030.199640] test 1 (128 bit key, 64 byte blocks): 2056931 operations in 1 
seconds (131643584 bytes)
[ 1031.206303] test 2 (128 bit key, 256 byte blocks): 589409 operations in 1 
seconds (150888704 bytes)
[ 1032.212832] test 3 (128 bit key, 1024 byte blocks): 163681 operations in 1 
seconds (167609344 bytes)
[ 1033.219443] test 4 (128 bit key, 8192 byte blocks): 21062 operations in 1 
seconds (172539904 bytes)
[ 1034.225979] test 5 (192 bit key, 16 byte blocks): 4931537 operations in 1 
seconds (78904592 bytes)
[ 1035.232608] test 6 (192 bit key, 64 byte blocks): 2053989 operations in 1 
seconds (131455296 bytes)
[ 1036.239289] test 7 (192 bit key, 256 byte blocks): 589591 operations in 1 
seconds (150935296 bytes)
[ 1037.241784] test 8 (192 bit key, 1024 byte blocks): 163565 operations in 1 
seconds (167490560 bytes)
[ 1038.244387] test 9 (192 bit key, 8192 byte blocks): 20899 operations in 1 
seconds (171204608 bytes)
[ 1039.250923] test 10 (256 bit key, 16 byte blocks): 4937343 operations in 1 
seconds (78997488 bytes)
[ 1040.257589] test 11 (256 bit key, 64 byte blocks): 2050678 operations in 1 
seconds (131243392 bytes)
[ 1041.264262] test 12 (256 bit key, 256 byte blocks): 586869 operations in 1 
seconds (150238464 bytes)
[ 1042.270753] test 13 (256 bit key, 1024 byte blocks): 163548 operations in 1 
seconds (167473152 bytes)
[ 1043.277365] test 14 (256 bit key, 8192 byte blocks): 21053 operations in 1 
seconds (172466176 bytes)
[ 1044.283892] 
[ 1044.283892] testing speed of async cbc(twofish) encryption
[ 1044.293349] test 0 (128 bit key, 16 byte blocks): 5186240 operations in 1 
seconds (82979840 bytes)
[ 1045.298534] test 1 (128 bit key, 64 byte blocks): 1921034 operations in 1 
seconds (122946176 bytes)
[ 1046.305207] test 2 (128 bit key, 256 byte blocks): 542787 operations in 1 
seconds (138953472 bytes)
[ 1047.311699] test 3 (128 bit key, 1024 byte blocks): 141399 operations in 1 
seconds (144792576 bytes)
[ 1048.318312] test 4 (128 bit key, 8192 byte blocks): 17755 operations in 1 
seconds (145448960 bytes)
[ 1049.324829] test 5 (192 bit key, 16 byte blocks): 5196441 operations in 1 
seconds (83143056 bytes)
[ 1050.331485] test 6 (192 bit key, 64 byte blocks): 1921456 operations in 1 
seconds (122973184 bytes)
[ 1051.338157] test 7 (192 bit key, 256 byte blocks): 543581 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-16 Thread Jussi Kivilinna

Quoting Borislav Petkov b...@alien8.de:


On Wed, Aug 15, 2012 at 08:34:25PM +0300, Jussi Kivilinna wrote:

About ~5% slower, probably because I was tuning for sandy-bridge and
introduced more FPU<=>CPU register moves.

Here's new version of patch, with FPU<=>CPU moves from original
implementation.

(Note: also changes encryption function to inline all code in to main
function, decryption still places common code to separate function to
reduce object size. This is to measure the difference.)


Yep, looks better than the previous run and also a bit better or on par
with the initial run I did.


Thanks again. The speed gained with the patch is ~8%, which is enough for  
twofish-avx to pass twofish-3way.




The thing is, I'm not sure whether optimizing the thing for each uarch
is a workable solution software-wise or maybe having a single version
which performs sufficiently ok on all uarches is easier/better to
maintain without causing code bloat. Hmmm...


Agreed, testing on multiple CPUs to get a single well-working version is  
what I have done in the past. But purchasing all the latest CPUs on  
the market isn't an option for me, and for testing AVX I'm stuck with  
sandy-bridge :)


-Jussi


4th:

ran like 1st.

[ 1014.074150]
[ 1014.074150] testing speed of async ecb(twofish) encryption
[ 1014.083829] test 0 (128 bit key, 16 byte blocks): 4870055  
operations in 1 seconds (77920880 bytes)
[ 1015.092757] test 1 (128 bit key, 64 byte blocks): 2043828  
operations in 1 seconds (130804992 bytes)
[ 1016.099441] test 2 (128 bit key, 256 byte blocks): 606400  
operations in 1 seconds (155238400 bytes)
[ 1017.105939] test 3 (128 bit key, 1024 byte blocks): 168939  
operations in 1 seconds (172993536 bytes)
[ 1018.112517] test 4 (128 bit key, 8192 byte blocks): 21777  
operations in 1 seconds (178397184 bytes)
[ 1019.119035] test 5 (192 bit key, 16 byte blocks): 4882254  
operations in 1 seconds (78116064 bytes)
[ 1020.125716] test 6 (192 bit key, 64 byte blocks): 2043230  
operations in 1 seconds (130766720 bytes)
[ 1021.132391] test 7 (192 bit key, 256 byte blocks): 607477  
operations in 1 seconds (155514112 bytes)
[ 1022.138889] test 8 (192 bit key, 1024 byte blocks): 168743  
operations in 1 seconds (172792832 bytes)
[ 1023.145476] test 9 (192 bit key, 8192 byte blocks): 21442  
operations in 1 seconds (175652864 bytes)
[ 1024.152012] test 10 (256 bit key, 16 byte blocks): 4891863  
operations in 1 seconds (78269808 bytes)
[ 1025.158684] test 11 (256 bit key, 64 byte blocks): 2049390  
operations in 1 seconds (131160960 bytes)
[ 1026.165366] test 12 (256 bit key, 256 byte blocks): 606847  
operations in 1 seconds (155352832 bytes)
[ 1027.171841] test 13 (256 bit key, 1024 byte blocks): 169228  
operations in 1 seconds (173289472 bytes)
[ 1028.178436] test 14 (256 bit key, 8192 byte blocks): 21773  
operations in 1 seconds (178364416 bytes)

[ 1029.184981]
[ 1029.184981] testing speed of async ecb(twofish) decryption
[ 1029.194508] test 0 (128 bit key, 16 byte blocks): 4931065  
operations in 1 seconds (78897040 bytes)
[ 1030.199640] test 1 (128 bit key, 64 byte blocks): 2056931  
operations in 1 seconds (131643584 bytes)
[ 1031.206303] test 2 (128 bit key, 256 byte blocks): 589409  
operations in 1 seconds (150888704 bytes)
[ 1032.212832] test 3 (128 bit key, 1024 byte blocks): 163681  
operations in 1 seconds (167609344 bytes)
[ 1033.219443] test 4 (128 bit key, 8192 byte blocks): 21062  
operations in 1 seconds (172539904 bytes)
[ 1034.225979] test 5 (192 bit key, 16 byte blocks): 4931537  
operations in 1 seconds (78904592 bytes)
[ 1035.232608] test 6 (192 bit key, 64 byte blocks): 2053989  
operations in 1 seconds (131455296 bytes)
[ 1036.239289] test 7 (192 bit key, 256 byte blocks): 589591  
operations in 1 seconds (150935296 bytes)
[ 1037.241784] test 8 (192 bit key, 1024 byte blocks): 163565  
operations in 1 seconds (167490560 bytes)
[ 1038.244387] test 9 (192 bit key, 8192 byte blocks): 20899  
operations in 1 seconds (171204608 bytes)
[ 1039.250923] test 10 (256 bit key, 16 byte blocks): 4937343  
operations in 1 seconds (78997488 bytes)
[ 1040.257589] test 11 (256 bit key, 64 byte blocks): 2050678  
operations in 1 seconds (131243392 bytes)
[ 1041.264262] test 12 (256 bit key, 256 byte blocks): 586869  
operations in 1 seconds (150238464 bytes)
[ 1042.270753] test 13 (256 bit key, 1024 byte blocks): 163548  
operations in 1 seconds (167473152 bytes)
[ 1043.277365] test 14 (256 bit key, 8192 byte blocks): 21053  
operations in 1 seconds (172466176 bytes)

[ 1044.283892]
[ 1044.283892] testing speed of async cbc(twofish) encryption
[ 1044.293349] test 0 (128 bit key, 16 byte blocks): 5186240  
operations in 1 seconds (82979840 bytes)
[ 1045.298534] test 1 (128 bit key, 64 byte blocks): 1921034  
operations in 1 seconds (122946176 bytes)
[ 1046.305207] test 2 (128 bit key, 256 byte blocks): 542787  
operations in 1 seconds (138953472 bytes)
[ 1047.311699] test 3 (128 bit key, 1024 byte blocks): 141399  

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Jussi Kivilinna
Quoting Borislav Petkov :

> On Wed, Aug 15, 2012 at 05:22:03PM +0300, Jussi Kivilinna wrote:
>
>> Patch replaces 'movb' instructions with 'movzbl' to break false
>> register dependencies and interleaves instructions better for
>> out-of-order scheduling.
>>
>> Also move common round code to separate function to reduce object
>> size.
>
> Ok, redid the first test
>

Thanks.

> $ modprobe twofish-avx-x86_64
> $ modprobe tcrypt mode=504 sec=1
>
> and from quickly juxtaposing the two results, I'd say the patch makes
> things slightly worse but you'd need to run your scripts on it to get
> the accurate results:
>

About ~5% slower, probably because I was tuning for sandy-bridge and introduced
more FPU<=>CPU register moves.

Here's new version of patch, with FPU<=>CPU moves from original implementation.

(Note: also changes encryption function to inline all code in to main function,
decryption still places common code to separate function to reduce object size.
This is to measure the difference.)

-Jussi

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  124 +--
 1 file changed, 77 insertions(+), 47 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..d331ab8 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -47,15 +47,22 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RX1 %xmm10
+#define RY1 %xmm11
+
+#define RK1 %xmm12
+#define RK2 %xmm13
+
+#define RT %xmm14
 
 #define RID1  %rax
+#define RID1d %eax
 #define RID1b %al
 #define RID2  %rbx
+#define RID2d %ebx
 #define RID2b %bl
 
 #define RGI1   %rdx
@@ -73,40 +80,48 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   shrq $16,   src; \
movlt0(CTX, RID1, 4), dst ## d;  \
xorlt1(CTX, RID2, 4), dst ## d;  \
-   shrq $16,   src; \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   interleave_op(il_reg);   \
xorlt2(CTX, RID1, 4), dst ## d;  \
xorlt3(CTX, RID2, 4), dst ## d;
 
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+   shrq $16,   reg;
+
 #define G(a, x, t0, t1, t2, t3) \
-   vmovq   a,RGI1;   \
-   vpsrldq $8, a,x;  \
-   vmovq   x,RGI2;   \
+   vmovq   a, RGI1;  \
+   vpextrq $1, a, RGI2;  \
\
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-   shrq $16,   RGI1; \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
+   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1, shr_next, RGI1); \
+   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2, dummy, none); \
shlq $32,   RGS2; \
orq RGS1, RGS2;   \
\
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
-   shrq $16,   RGI2; \
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
-   shlq $32,   RGS3; \
+   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3, shr_next, RGI2); \
+   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1, dummy, none); \
+   shlq $32,   RGS1; \
orq RGS1, RGS3;   \
\
vmovq   RGS2, x;  \
vpinsrq $1, RGS3, x, x;
 
-#define encround(a, b, c, d, x, y) \
-   G(a, x, s0, s1, s2, s3);   \
-   G(b, y, s1, s2, s3, s0);   \
+#define encround_g1g2(a, b, c, d, x, y) \
+   G(a, x, s0, s1, s2, s3); \
+   G(b, y, s1, s2, s3, s0);
+
+#define encround_end(a, b, c, d, x, y) \
+   vpslld $1,  d, RT; \
+   vpsrld $(32 - 1),   d, d;  \
+   vpord, RT,  d; \
vpaddd  x, y,   x; \
vpaddd  y, x,   y; \
vpaddd  x, RK1, x; \
@@ -115,14 +130,16 @@
vpsrld $1,  c, x;  \
vpslld $(32 - 1),   c, c;  \
vporc, x,   c; \
-   vpslld $1,  d, x;  \
-   vpsrld $(32 - 1),   d, d;  \
-   vpord, x,   d; \

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Borislav Petkov
On Wed, Aug 15, 2012 at 05:22:03PM +0300, Jussi Kivilinna wrote:

> Patch replaces 'movb' instructions with 'movzbl' to break false
> register dependencies and interleaves instructions better for
> out-of-order scheduling.
>
> Also move common round code to separate function to reduce object
> size.

Ok, redid the first test

$ modprobe twofish-avx-x86_64
$ modprobe tcrypt mode=504 sec=1

and from quickly juxtaposing the two results, I'd say the patch makes
things slightly worse but you'd need to run your scripts on it to get
the accurate results:

[   98.206067] testing speed of async ecb(twofish) encryption
[   98.214796] test 0 (128 bit key, 16 byte blocks): 4549296 operations in 1 
seconds (72788736 bytes)
[   99.221569] test 1 (128 bit key, 64 byte blocks): 1995934 operations in 1 
seconds (127739776 bytes)
[  100.228250] test 2 (128 bit key, 256 byte blocks): 535040 operations in 1 
seconds (136970240 bytes)
[  101.234751] test 3 (128 bit key, 1024 byte blocks): 148602 operations in 1 
seconds (152168448 bytes)
[  102.241345] test 4 (128 bit key, 8192 byte blocks): 19148 operations in 1 
seconds (156860416 bytes)
[  103.247880] test 5 (192 bit key, 16 byte blocks): 4558391 operations in 1 
seconds (72934256 bytes)
[  104.254547] test 6 (192 bit key, 64 byte blocks): 1997838 operations in 1 
seconds (127861632 bytes)
[  105.261202] test 7 (192 bit key, 256 byte blocks): 534396 operations in 1 
seconds (136805376 bytes)
[  106.267694] test 8 (192 bit key, 1024 byte blocks): 148199 operations in 1 
seconds (151755776 bytes)
[  107.274296] test 9 (192 bit key, 8192 byte blocks): 18913 operations in 1 
seconds (154935296 bytes)
[  108.280824] test 10 (256 bit key, 16 byte blocks): 4595524 operations in 1 
seconds (73528384 bytes)
[  109.287496] test 11 (256 bit key, 64 byte blocks): 1997893 operations in 1 
seconds (127865152 bytes)
[  110.294168] test 12 (256 bit key, 256 byte blocks): 533790 operations in 1 
seconds (136650240 bytes)
[  111.300679] test 13 (256 bit key, 1024 byte blocks): 148787 operations in 1 
seconds (152357888 bytes)
[  112.303561] test 14 (256 bit key, 8192 byte blocks): 19146 operations in 1 
seconds (156844032 bytes)
[  113.310104] 
[  113.310104] testing speed of async ecb(twofish) decryption
[  113.319419] test 0 (128 bit key, 16 byte blocks): 4754043 operations in 1 
seconds (76064688 bytes)
[  114.324768] test 1 (128 bit key, 64 byte blocks): 1831420 operations in 1 
seconds (117210880 bytes)
[  115.331441] test 2 (128 bit key, 256 byte blocks): 541170 operations in 1 
seconds (138539520 bytes)
[  116.337957] test 3 (128 bit key, 1024 byte blocks): 150538 operations in 1 
seconds (154150912 bytes)
[  117.344571] test 4 (128 bit key, 8192 byte blocks): 19397 operations in 1 
seconds (158900224 bytes)
[  118.351122] test 5 (192 bit key, 16 byte blocks): 4753957 operations in 1 
seconds (76063312 bytes)
[  119.357778] test 6 (192 bit key, 64 byte blocks): 1828676 operations in 1 
seconds (117035264 bytes)
[  120.364459] test 7 (192 bit key, 256 byte blocks): 540331 operations in 1 
seconds (138324736 bytes)
[  121.370969] test 8 (192 bit key, 1024 byte blocks): 150348 operations in 1 
seconds (153956352 bytes)
[  122.377573] test 9 (192 bit key, 8192 byte blocks): 19196 operations in 1 
seconds (157253632 bytes)
[  123.384080] test 10 (256 bit key, 16 byte blocks): 4664399 operations in 1 
seconds (74630384 bytes)
[  124.390782] test 11 (256 bit key, 64 byte blocks): 1839324 operations in 1 
seconds (117716736 bytes)
[  125.397463] test 12 (256 bit key, 256 byte blocks): 538735 operations in 1 
seconds (137916160 bytes)
[  126.403962] test 13 (256 bit key, 1024 byte blocks): 150489 operations in 1 
seconds (154100736 bytes)
[  127.410567] test 14 (256 bit key, 8192 byte blocks): 19397 operations in 1 
seconds (158900224 bytes)
[  128.417091] 
[  128.417091] testing speed of async cbc(twofish) encryption
[  128.431227] test 0 (128 bit key, 16 byte blocks): 4681239 operations in 1 
seconds (74899824 bytes)
[  129.439466] test 1 (128 bit key, 64 byte blocks): 1836636 operations in 1 
seconds (117544704 bytes)
[  130.446131] test 2 (128 bit key, 256 byte blocks): 536055 operations in 1 
seconds (137230080 bytes)
[  131.452631] test 3 (128 bit key, 1024 byte blocks): 140955 operations in 1 
seconds (144337920 bytes)
[  132.459243] test 4 (128 bit key, 8192 byte blocks): 17821 operations in 1 
seconds (145989632 bytes)
[  133.466124] test 5 (192 bit key, 16 byte blocks): 4674373 operations in 1 
seconds (74789968 bytes)
[  134.472728] test 6 (192 bit key, 64 byte blocks): 1835821 operations in 1 
seconds (117492544 bytes)
[  135.479374] test 7 (192 bit key, 256 byte blocks): 535882 operations in 1 
seconds (137185792 bytes)
[  136.485876] test 8 (192 bit key, 1024 byte blocks): 140917 operations in 1 
seconds (144299008 bytes)
[  137.492470] test 9 (192 bit key, 8192 byte blocks): 17707 operations in 1 
seconds (145055744 bytes)
[  138.498979] test 10 (256 bit key, 16 byte blocks): 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Jussi Kivilinna
> On Wed, Aug 15, 2012 at 04:48:54PM +0300, Jussi Kivilinna wrote:
> > I posted patch that optimize twofish-avx few weeks ago:
> > http://marc.info/?l=linux-crypto-vger&m=134364845024825&w=2
> >
> > I'd be interested to know if this patch helps on Bulldozer.
> 
> Sure, can you inline it here too please. The "Download message RAW" link
> on marc.info gives me a diff but patch says:
> 
> patching file arch/x86/crypto/twofish-avx-x86_64-asm_64.S
> patch unexpectedly ends in middle of line
> 
> Thanks.

Here...


Patch replaces 'movb' instructions with 'movzbl' to break false register
dependencies and interleaves instructions better for out-of-order scheduling.

Also move common round code to separate function to reduce object size.

Tested on Core i5-2450M.

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  144 +--
 1 file changed, 92 insertions(+), 52 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..42b27b7 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -47,15 +47,22 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RX1 %xmm10
+#define RY1 %xmm11
+
+#define RK1 %xmm12
+#define RK2 %xmm13
+
+#define RT %xmm14
 
 #define RID1  %rax
+#define RID1d %eax
 #define RID1b %al
 #define RID2  %rbx
+#define RID2d %ebx
 #define RID2b %bl
 
 #define RGI1   %rdx
@@ -73,40 +80,45 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   shrq $16,   src; \
movlt0(CTX, RID1, 4), dst ## d;  \
xorlt1(CTX, RID2, 4), dst ## d;  \
-   shrq $16,   src; \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   interleave_op(il_reg);   \
xorlt2(CTX, RID1, 4), dst ## d;  \
xorlt3(CTX, RID2, 4), dst ## d;
 
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+   shrq $16,   reg;
+
 #define G(a, x, t0, t1, t2, t3) \
vmovq   a,RGI1;   \
-   vpsrldq $8, a,x;  \
-   vmovq   x,RGI2;   \
+   vpextrq $1, a,RGI2;   \
\
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-   shrq $16,   RGI1; \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
-   shlq $32,   RGS2; \
-   orq RGS1, RGS2;   \
+   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1, shr_next, RGI1); \
+   vmovd   RGS1d, x;\
+   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2, dummy, none); \
+   vpinsrd $1, RGS2d, x, x; \
\
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
-   shrq $16,   RGI2; \
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
-   shlq $32,   RGS3; \
-   orq RGS1, RGS3;   \
-   \
-   vmovq   RGS2, x;  \
-   vpinsrq $1, RGS3, x, x;
+   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1, shr_next, RGI2); \
+   vpinsrd $2, RGS1d, x, x; \
+   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3, dummy, none); \
+   vpinsrd $3, RGS3d, x, x;
+
+#define encround_g1g2(a, b, c, d, x, y) \
+   G(a, x, s0, s1, s2, s3); \
+   G(b, y, s1, s2, s3, s0);
 
-#define encround(a, b, c, d, x, y) \
-   G(a, x, s0, s1, s2, s3);   \
-   G(b, y, s1, s2, s3, s0);   \
+#define encround_end(a, b, c, d, x, y) \
+   vpslld $1,  d, RT; \
+   vpsrld $(32 - 1),   d, d;  \
+   vpord, RT,  d; \
vpaddd  x, y,   x; \
vpaddd  y, x,   y; \
vpaddd  x, RK1, x; \
@@ -115,14 +127,16 @@
vpsrld $1,  c, x;  \
vpslld $(32 - 1),   c, c;  \
vporc, x,   c; \
-   vpslld $1,  d, x;  \
-   vpsrld $(32 - 1),   d, d;  \
-   vpord, x,   d; \
vpxor   d, y,   d;
 
-#define decround(a, b, c, d, x, y) \
-   G(a, x, s0, s1, s2, s3);   \
-   G(b, y, s1, s2, s3, s0);   

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Borislav Petkov
On Wed, Aug 15, 2012 at 04:48:54PM +0300, Jussi Kivilinna wrote:
> I posted patch that optimize twofish-avx few weeks ago:
> http://marc.info/?l=linux-crypto-vger&m=134364845024825&w=2
>
> I'd be interested to know if this patch helps on Bulldozer.

Sure, can you inline it here too please. The "Download message RAW" link
on marc.info gives me a diff but patch says:

patching file arch/x86/crypto/twofish-avx-x86_64-asm_64.S
patch unexpectedly ends in middle of line

Thanks.

-- 
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Jussi Kivilinna

Quoting Borislav Petkov :


Ok, here we go. Raw data below.


Thanks a lot!

Twofish-avx appears somewhat slower than 3way: from ~9% slower with 256-byte  
blocks to ~3% slower with 8 KB blocks.










Let me know if you need more tests.


I posted a patch that optimizes twofish-avx a few weeks ago:  
http://marc.info/?l=linux-crypto-vger&m=134364845024825&w=2


I'd be interested to know if this patch helps on Bulldozer.

-Jussi



HTH.

--
Regards/Gruss,
Boris.






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Borislav Petkov
Ok, here we go. Raw data below.

On Wed, Aug 15, 2012 at 02:00:16PM +0300, Jussi Kivilinna wrote:
> >And if you tell me exactly how to run the tests and on what kernel,
> >I'll try to do so.

Ok, the box is a single-socket Bulldozer: "AMD FX(tm)-8100 Eight-Core
Processor stepping 02"; kernel is 3.6.0-rc1+ which is latest Linus +
tip/master merged ontop.

> Twofish-avx (CONFIG_TWOFISH_AVX_X86_64) is available in 3.6-rc1. For

I took CONFIG_CRYPTO_TWOFISH_AVX_X86_64 but I'm pretty sure you meant
that.

> testing you need CRYPTO_TEST build as module. You should turn off
> turbo-core, freq-scaling, etc.

$ for i in $(seq 0 7); do echo "performance" > 
/sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor ; done
$ for i in $(seq 0 7); do echo 0 > /sys/devices/system/cpu/cpu$i/cpufreq/cpb ; 
done

> Testing twofish-avx ('async twofish' speed test):
>  modprobe twofish-avx-x86_64
>  modprobe tcrypt mode=504 sec=1

$ modprobe twofish-avx-x86_64
$ modprobe tcrypt mode=504 sec=1

[  224.672094] 
[  224.672094] testing speed of async ecb(twofish) encryption
[  224.681444] test 0 (128 bit key, 16 byte blocks): 4862478 operations in 1 
seconds (77799648 bytes)
[  225.689190] test 1 (128 bit key, 64 byte blocks): 2040557 operations in 1 
seconds (130595648 bytes)
[  226.695864] test 2 (128 bit key, 256 byte blocks): 564098 operations in 1 
seconds (144409088 bytes)
[  227.702365] test 3 (128 bit key, 1024 byte blocks): 156553 operations in 1 
seconds (160310272 bytes)
[  228.708960] test 4 (128 bit key, 8192 byte blocks): 20128 operations in 1 
seconds (164888576 bytes)
[  229.715485] test 5 (192 bit key, 16 byte blocks): 4853879 operations in 1 
seconds (77662064 bytes)
[  230.722165] test 6 (192 bit key, 64 byte blocks): 2040187 operations in 1 
seconds (130571968 bytes)
[  231.729110] test 7 (192 bit key, 256 byte blocks): 564125 operations in 1 
seconds (144416000 bytes)
[  232.735600] test 8 (192 bit key, 1024 byte blocks): 156231 operations in 1 
seconds (159980544 bytes)
[  233.742205] test 9 (192 bit key, 8192 byte blocks): 19913 operations in 1 
seconds (163127296 bytes)
[  234.748777] test 10 (256 bit key, 16 byte blocks): 4880977 operations in 1 
seconds (78095632 bytes)
[  235.751405] test 11 (256 bit key, 64 byte blocks): 2045621 operations in 1 
seconds (130919744 bytes)
[  236.758079] test 12 (256 bit key, 256 byte blocks): 565273 operations in 1 
seconds (144709888 bytes)
[  237.764579] test 13 (256 bit key, 1024 byte blocks): 156625 operations in 1 
seconds (160384000 bytes)
[  238.771175] test 14 (256 bit key, 8192 byte blocks): 20125 operations in 1 
seconds (164864000 bytes)
[  239.26] 
[  239.26] testing speed of async ecb(twofish) decryption
[  239.787020] test 0 (128 bit key, 16 byte blocks): 4962193 operations in 1 
seconds (79395088 bytes)
[  240.792405] test 1 (128 bit key, 64 byte blocks): 2056765 operations in 1 
seconds (131632960 bytes)
[  241.799070] test 2 (128 bit key, 256 byte blocks): 559384 operations in 1 
seconds (143202304 bytes)
[  242.805568] test 3 (128 bit key, 1024 byte blocks): 153881 operations in 1 
seconds (157574144 bytes)
[  243.812191] test 4 (128 bit key, 8192 byte blocks): 19636 operations in 1 
seconds (160858112 bytes)
[  244.818718] test 5 (192 bit key, 16 byte blocks): 4917689 operations in 1 
seconds (78683024 bytes)
[  245.825408] test 6 (192 bit key, 64 byte blocks): 2056235 operations in 1 
seconds (131599040 bytes)
[  246.832070] test 7 (192 bit key, 256 byte blocks): 560579 operations in 1 
seconds (143508224 bytes)
[  247.838598] test 8 (192 bit key, 1024 byte blocks): 153813 operations in 1 
seconds (157504512 bytes)
[  248.845201] test 9 (192 bit key, 8192 byte blocks): 19411 operations in 1 
seconds (159014912 bytes)
[  249.851755] test 10 (256 bit key, 16 byte blocks): 4932508 operations in 1 
seconds (78920128 bytes)
[  250.858372] test 11 (256 bit key, 64 byte blocks): 2057244 operations in 1 
seconds (131663616 bytes)
[  251.865039] test 12 (256 bit key, 256 byte blocks): 559493 operations in 1 
seconds (143230208 bytes)
[  252.871554] test 13 (256 bit key, 1024 byte blocks): 153980 operations in 1 
seconds (157675520 bytes)
[  253.878159] test 14 (256 bit key, 8192 byte blocks): 19665 operations in 1 
seconds (161095680 bytes)
[  254.884711] 
[  254.884711] testing speed of async cbc(twofish) encryption
[  254.898925] test 0 (128 bit key, 16 byte blocks): 5194404 operations in 1 
seconds (83110464 bytes)
[  255.907087] test 1 (128 bit key, 64 byte blocks): 1916243 operations in 1 
seconds (122639552 bytes)
[  256.913758] test 2 (128 bit key, 256 byte blocks): 541282 operations in 1 
seconds (138568192 bytes)
[  257.916278] test 3 (128 bit key, 1024 byte blocks): 141389 operations in 1 
seconds (144782336 bytes)
[  258.918865] test 4 (128 bit key, 8192 byte blocks): 17811 operations in 1 
seconds (145907712 bytes)
[  259.925372] test 5 (192 bit key, 16 byte blocks): 5176387 operations in 1 
seconds (82822192 bytes)
[  260.932038] test 6 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Jussi Kivilinna

Quoting Borislav Petkov :


On Wed, Aug 15, 2012 at 11:42:16AM +0300, Jussi Kivilinna wrote:

I started thinking about the performance on AMD Bulldozer.
vmovq/vmovd/vpextr*/vpinsr* between FPU and general purpose registers
on AMD CPU is a lot slower (latencies from 8 to 12 cycles) than on
Intel sandy-bridge (where instructions have latency of 1 to 2). See:
http://www.agner.org/optimize/instruction_tables.pdf

It would be really good, if implementation could be tested on AMD CPU
to determine whether it causes a performance regression. However, I don't
have access to machine with such CPU.


But I do. :)

And if you tell me exactly how to run the tests and on what kernel, I'll
try to do so.



Twofish-avx (CONFIG_TWOFISH_AVX_X86_64) is available in 3.6-rc1. For  
testing you need CRYPTO_TEST built as a module. You should turn off  
turbo-core, freq-scaling, etc.


Testing twofish-avx ('async twofish' speed test):
 modprobe twofish-avx-x86_64
 modprobe tcrypt mode=504 sec=1

Testing twofish-x86_64-3way ('sync twofish' speed test):
 modprobe twofish-x86_64-3way
 modprobe tcrypt mode=202 sec=1

Loading tcrypt will block until tests are complete, after which  
modprobe will return with error. This is expected. Results are in  
kernel log.


-Jussi


HTH.

--
Regards/Gruss,
Boris.






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Borislav Petkov
On Wed, Aug 15, 2012 at 11:42:16AM +0300, Jussi Kivilinna wrote:
> I started thinking about the performance on AMD Bulldozer.
> vmovq/vmovd/vpextr*/vpinsr* between FPU and general purpose registers
> on AMD CPU is a lot slower (latencies from 8 to 12 cycles) than on
> Intel sandy-bridge (where instructions have latency of 1 to 2). See:
> http://www.agner.org/optimize/instruction_tables.pdf
>
> It would be really good, if implementation could be tested on AMD CPU
> to determine whether it causes a performance regression. However, I don't
> have access to machine with such CPU.

But I do. :)

And if you tell me exactly how to run the tests and on what kernel, I'll
try to do so.

HTH.

-- 
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Jussi Kivilinna
Quoting Johannes Goetzfried:



This patch adds a x86_64/avx assembler implementation of the Twofish block
cipher. The implementation processes eight blocks in parallel (two 4 block
chunk AVX operations). The table-lookups are done in general-purpose  
registers.

For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way
module are called. A good performance increase is provided for blocksizes
greater or equal to 128B.

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)


I started thinking about the performance on AMD Bulldozer.
vmovq/vmovd/vpextr*/vpinsr* between FPU and general purpose registers
on AMD CPUs is a lot slower (latencies from 8 to 12 cycles) than on
Intel Sandy Bridge (where these instructions have a latency of 1 to 2
cycles). See:
http://www.agner.org/optimize/instruction_tables.pdf


It would be really good if the implementation could be tested on an AMD
CPU to determine whether it causes a performance regression. However, I
don't have access to a machine with such a CPU.


-Jussi

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Jussi Kivilinna
Quoting Johannes Goetzfried <johannes.goetzfr...@informatik.stud.uni-erlangen.de>:



This patch adds a x86_64/avx assembler implementation of the Twofish block
cipher. The implementation processes eight blocks in parallel (two 4 block
chunk AVX operations). The table-lookups are done in general-purpose  
registers.

For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way
module are called. A good performance increase is provided for blocksizes
greater or equal to 128B.

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)


I started thinking about the performance on AMD Bulldozer.
vmovq/vmovd/vpextr*/vpinsr* between FPU and general purpose registers
on AMD CPUs is a lot slower (latencies from 8 to 12 cycles) than on
Intel Sandy Bridge (where these instructions have a latency of 1 to 2
cycles). See:
http://www.agner.org/optimize/instruction_tables.pdf


It would be really good if the implementation could be tested on an AMD
CPU to determine whether it causes a performance regression. However, I
don't have access to a machine with such a CPU.


-Jussi

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Borislav Petkov
On Wed, Aug 15, 2012 at 11:42:16AM +0300, Jussi Kivilinna wrote:
> I started thinking about the performance on AMD Bulldozer.
> vmovq/vmovd/vpextr*/vpinsr* between FPU and general purpose registers
> on AMD CPU is alot slower (latencies from 8 to 12 cycles) than on
> Intel sandy-bridge (where instructions have latency of 1 to 2). See:
> http://www.agner.org/optimize/instruction_tables.pdf
>
> It would be really good, if implementation could be tested on AMD CPU
> to determinate, if it causes performance regression. However I don't
> have access to machine with such CPU.

But I do. :)

And if you tell me exactly how to run the tests and on what kernel, I'll
try to do so.

HTH.

-- 
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Jussi Kivilinna

Quoting Borislav Petkov <b...@alien8.de>:


On Wed, Aug 15, 2012 at 11:42:16AM +0300, Jussi Kivilinna wrote:

I started thinking about the performance on AMD Bulldozer.
vmovq/vmovd/vpextr*/vpinsr* between FPU and general purpose registers
on AMD CPU is alot slower (latencies from 8 to 12 cycles) than on
Intel sandy-bridge (where instructions have latency of 1 to 2). See:
http://www.agner.org/optimize/instruction_tables.pdf

It would be really good, if implementation could be tested on AMD CPU
to determinate, if it causes performance regression. However I don't
have access to machine with such CPU.


But I do. :)

And if you tell me exactly how to run the tests and on what kernel, I'll
try to do so.



Twofish-avx (CONFIG_TWOFISH_AVX_X86_64) is available in 3.6-rc1. For
testing you need CRYPTO_TEST built as a module. You should turn off
turbo-core, freq-scaling, etc.


Testing twofish-avx ('async twofish' speed test):
 modprobe twofish-avx-x86_64
 modprobe tcrypt mode=504 sec=1

Testing twofish-x86_64-3way ('sync twofish' speed test):
 modprobe twofish-x86_64-3way
 modprobe tcrypt mode=202 sec=1

Loading tcrypt will block until the tests are complete, after which
modprobe will return with an error. This is expected. The results are in
the kernel log.


-Jussi


HTH.

--
Regards/Gruss,
Boris.






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Borislav Petkov
Ok, here we go. Raw data below.

On Wed, Aug 15, 2012 at 02:00:16PM +0300, Jussi Kivilinna wrote:
> And if you tell me exactly how to run the tests and on what kernel,
> I'll try to do so.

Ok, the box is a single-socket Bulldozer: AMD FX(tm)-8100 Eight-Core
Processor stepping 02; kernel is 3.6.0-rc1+ which is latest Linus +
tip/master merged on top.

> Twofish-avx (CONFIG_TWOFISH_AVX_X86_64) is available in 3.6-rc1. For

I took CONFIG_CRYPTO_TWOFISH_AVX_X86_64 but I'm pretty sure you meant
that.

> testing you need CRYPTO_TEST build as module. You should turn off
> turbo-core, freq-scaling, etc.

$ for i in $(seq 0 7); do echo performance > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor ; done
$ for i in $(seq 0 7); do echo 0 > /sys/devices/system/cpu/cpu$i/cpufreq/cpb ; done

> Testing twofish-avx ('async twofish' speed test):
>  modprobe twofish-avx-x86_64
>  modprobe tcrypt mode=504 sec=1

$ modprobe twofish-avx-x86_64
$ modprobe tcrypt mode=504 sec=1

[  224.672094] 
[  224.672094] testing speed of async ecb(twofish) encryption
[  224.681444] test 0 (128 bit key, 16 byte blocks): 4862478 operations in 1 
seconds (77799648 bytes)
[  225.689190] test 1 (128 bit key, 64 byte blocks): 2040557 operations in 1 
seconds (130595648 bytes)
[  226.695864] test 2 (128 bit key, 256 byte blocks): 564098 operations in 1 
seconds (144409088 bytes)
[  227.702365] test 3 (128 bit key, 1024 byte blocks): 156553 operations in 1 
seconds (160310272 bytes)
[  228.708960] test 4 (128 bit key, 8192 byte blocks): 20128 operations in 1 
seconds (164888576 bytes)
[  229.715485] test 5 (192 bit key, 16 byte blocks): 4853879 operations in 1 
seconds (77662064 bytes)
[  230.722165] test 6 (192 bit key, 64 byte blocks): 2040187 operations in 1 
seconds (130571968 bytes)
[  231.729110] test 7 (192 bit key, 256 byte blocks): 564125 operations in 1 
seconds (144416000 bytes)
[  232.735600] test 8 (192 bit key, 1024 byte blocks): 156231 operations in 1 
seconds (159980544 bytes)
[  233.742205] test 9 (192 bit key, 8192 byte blocks): 19913 operations in 1 
seconds (163127296 bytes)
[  234.748777] test 10 (256 bit key, 16 byte blocks): 4880977 operations in 1 
seconds (78095632 bytes)
[  235.751405] test 11 (256 bit key, 64 byte blocks): 2045621 operations in 1 
seconds (130919744 bytes)
[  236.758079] test 12 (256 bit key, 256 byte blocks): 565273 operations in 1 
seconds (144709888 bytes)
[  237.764579] test 13 (256 bit key, 1024 byte blocks): 156625 operations in 1 
seconds (160384000 bytes)
[  238.771175] test 14 (256 bit key, 8192 byte blocks): 20125 operations in 1 
seconds (164864000 bytes)
[  239.26] 
[  239.26] testing speed of async ecb(twofish) decryption
[  239.787020] test 0 (128 bit key, 16 byte blocks): 4962193 operations in 1 
seconds (79395088 bytes)
[  240.792405] test 1 (128 bit key, 64 byte blocks): 2056765 operations in 1 
seconds (131632960 bytes)
[  241.799070] test 2 (128 bit key, 256 byte blocks): 559384 operations in 1 
seconds (143202304 bytes)
[  242.805568] test 3 (128 bit key, 1024 byte blocks): 153881 operations in 1 
seconds (157574144 bytes)
[  243.812191] test 4 (128 bit key, 8192 byte blocks): 19636 operations in 1 
seconds (160858112 bytes)
[  244.818718] test 5 (192 bit key, 16 byte blocks): 4917689 operations in 1 
seconds (78683024 bytes)
[  245.825408] test 6 (192 bit key, 64 byte blocks): 2056235 operations in 1 
seconds (131599040 bytes)
[  246.832070] test 7 (192 bit key, 256 byte blocks): 560579 operations in 1 
seconds (143508224 bytes)
[  247.838598] test 8 (192 bit key, 1024 byte blocks): 153813 operations in 1 
seconds (157504512 bytes)
[  248.845201] test 9 (192 bit key, 8192 byte blocks): 19411 operations in 1 
seconds (159014912 bytes)
[  249.851755] test 10 (256 bit key, 16 byte blocks): 4932508 operations in 1 
seconds (78920128 bytes)
[  250.858372] test 11 (256 bit key, 64 byte blocks): 2057244 operations in 1 
seconds (131663616 bytes)
[  251.865039] test 12 (256 bit key, 256 byte blocks): 559493 operations in 1 
seconds (143230208 bytes)
[  252.871554] test 13 (256 bit key, 1024 byte blocks): 153980 operations in 1 
seconds (157675520 bytes)
[  253.878159] test 14 (256 bit key, 8192 byte blocks): 19665 operations in 1 
seconds (161095680 bytes)
[  254.884711] 
[  254.884711] testing speed of async cbc(twofish) encryption
[  254.898925] test 0 (128 bit key, 16 byte blocks): 5194404 operations in 1 
seconds (83110464 bytes)
[  255.907087] test 1 (128 bit key, 64 byte blocks): 1916243 operations in 1 
seconds (122639552 bytes)
[  256.913758] test 2 (128 bit key, 256 byte blocks): 541282 operations in 1 
seconds (138568192 bytes)
[  257.916278] test 3 (128 bit key, 1024 byte blocks): 141389 operations in 1 
seconds (144782336 bytes)
[  258.918865] test 4 (128 bit key, 8192 byte blocks): 17811 operations in 1 
seconds (145907712 bytes)
[  259.925372] test 5 (192 bit key, 16 byte blocks): 5176387 operations in 1 
seconds (82822192 bytes)
[  260.932038] test 6 (192 bit key, 64 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Jussi Kivilinna

Quoting Borislav Petkov b...@alien8.de:


Ok, here we go. Raw data below.


Thanks a lot!

Twofish-avx appears somewhat slower than 3way: from ~9% slower with
256-byte blocks to ~3% slower with 8k blocks.






<snip>



Let me know if you need more tests.


I posted a patch that optimizes twofish-avx a few weeks ago:
http://marc.info/?l=linux-crypto-vger&m=134364845024825&w=2


I'd be interested to know if this patch helps on Bulldozer.

-Jussi



HTH.

--
Regards/Gruss,
Boris.






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Borislav Petkov
On Wed, Aug 15, 2012 at 04:48:54PM +0300, Jussi Kivilinna wrote:
> I posted patch that optimize twofish-avx few weeks ago:
> http://marc.info/?l=linux-crypto-vger&m=134364845024825&w=2
>
> I'd be interested to know, if this is patch helps on Bulldozer.

Sure, can you inline it here too please. The "Download message RAW" link
on marc.info gives me a diff but patch says:

patching file arch/x86/crypto/twofish-avx-x86_64-asm_64.S
patch unexpectedly ends in middle of line

Thanks.

-- 
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Jussi Kivilinna
> On Wed, Aug 15, 2012 at 04:48:54PM +0300, Jussi Kivilinna wrote:
> > I posted patch that optimize twofish-avx few weeks ago:
> > http://marc.info/?l=linux-crypto-vger&m=134364845024825&w=2
> >
> > I'd be interested to know, if this is patch helps on Bulldozer.
>
> Sure, can you inline it here too please. The "Download message RAW" link
> on marc.info gives me a diff but patch says:
>
> patching file arch/x86/crypto/twofish-avx-x86_64-asm_64.S
> patch unexpectedly ends in middle of line
>
> Thanks.

Here...


Patch replaces 'movb' instructions with 'movzbl' to break false register
dependencies and interleaves instructions better for out-of-order scheduling.

Also move common round code to separate function to reduce object size.

Tested on Core i5-2450M.

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  144 +--
 1 file changed, 92 insertions(+), 52 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..42b27b7 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -47,15 +47,22 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RX1 %xmm10
+#define RY1 %xmm11
+
+#define RK1 %xmm12
+#define RK2 %xmm13
+
+#define RT %xmm14
 
 #define RID1  %rax
+#define RID1d %eax
 #define RID1b %al
 #define RID2  %rbx
+#define RID2d %ebx
 #define RID2b %bl
 
 #define RGI1   %rdx
@@ -73,40 +80,45 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   shrq $16,   src; \
movlt0(CTX, RID1, 4), dst ## d;  \
xorlt1(CTX, RID2, 4), dst ## d;  \
-   shrq $16,   src; \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   interleave_op(il_reg);   \
xorlt2(CTX, RID1, 4), dst ## d;  \
xorlt3(CTX, RID2, 4), dst ## d;
 
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+   shrq $16,   reg;
+
 #define G(a, x, t0, t1, t2, t3) \
vmovq   a,RGI1;   \
-   vpsrldq $8, a,x;  \
-   vmovq   x,RGI2;   \
+   vpextrq $1, a,RGI2;   \
\
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-   shrq $16,   RGI1; \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
-   shlq $32,   RGS2; \
-   orq RGS1, RGS2;   \
+   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1, shr_next, RGI1); \
+   vmovd   RGS1d, x;\
+   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2, dummy, none); \
+   vpinsrd $1, RGS2d, x, x; \
\
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
-   shrq $16,   RGI2; \
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
-   shlq $32,   RGS3; \
-   orq RGS1, RGS3;   \
-   \
-   vmovq   RGS2, x;  \
-   vpinsrq $1, RGS3, x, x;
+   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1, shr_next, RGI2); \
+   vpinsrd $2, RGS1d, x, x; \
+   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3, dummy, none); \
+   vpinsrd $3, RGS3d, x, x;
+
+#define encround_g1g2(a, b, c, d, x, y) \
+   G(a, x, s0, s1, s2, s3); \
+   G(b, y, s1, s2, s3, s0);
 
-#define encround(a, b, c, d, x, y) \
-   G(a, x, s0, s1, s2, s3);   \
-   G(b, y, s1, s2, s3, s0);   \
+#define encround_end(a, b, c, d, x, y) \
+   vpslld $1,  d, RT; \
+   vpsrld $(32 - 1),   d, d;  \
+   vpord, RT,  d; \
vpaddd  x, y,   x; \
vpaddd  y, x,   y; \
vpaddd  x, RK1, x; \
@@ -115,14 +127,16 @@
vpsrld $1,  c, x;  \
vpslld $(32 - 1),   c, c;  \
vporc, x,   c; \
-   vpslld $1,  d, x;  \
-   vpsrld $(32 - 1),   d, d;  \
-   vpord, x,   d; \
vpxor   d, y,   d;
 
-#define decround(a, b, c, d, x, y) \
-   G(a, x, s0, s1, s2, s3);   \
-   G(b, y, s1, s2, s3, s0);   \
+#define 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Borislav Petkov
On Wed, Aug 15, 2012 at 05:22:03PM +0300, Jussi Kivilinna wrote:

> Patch replaces 'movb' instructions with 'movzbl' to break false
> register dependencies and interleaves instructions better for
> out-of-order scheduling.
>
> Also move common round code to separate function to reduce object
> size.

Ok, redid the first test

$ modprobe twofish-avx-x86_64
$ modprobe tcrypt mode=504 sec=1

and from quickly juxtaposing the two results, I'd say the patch makes
things slightly worse but you'd need to run your scripts on it to get
the accurate results:

[   98.206067] testing speed of async ecb(twofish) encryption
[   98.214796] test 0 (128 bit key, 16 byte blocks): 4549296 operations in 1 
seconds (72788736 bytes)
[   99.221569] test 1 (128 bit key, 64 byte blocks): 1995934 operations in 1 
seconds (127739776 bytes)
[  100.228250] test 2 (128 bit key, 256 byte blocks): 535040 operations in 1 
seconds (136970240 bytes)
[  101.234751] test 3 (128 bit key, 1024 byte blocks): 148602 operations in 1 
seconds (152168448 bytes)
[  102.241345] test 4 (128 bit key, 8192 byte blocks): 19148 operations in 1 
seconds (156860416 bytes)
[  103.247880] test 5 (192 bit key, 16 byte blocks): 4558391 operations in 1 
seconds (72934256 bytes)
[  104.254547] test 6 (192 bit key, 64 byte blocks): 1997838 operations in 1 
seconds (127861632 bytes)
[  105.261202] test 7 (192 bit key, 256 byte blocks): 534396 operations in 1 
seconds (136805376 bytes)
[  106.267694] test 8 (192 bit key, 1024 byte blocks): 148199 operations in 1 
seconds (151755776 bytes)
[  107.274296] test 9 (192 bit key, 8192 byte blocks): 18913 operations in 1 
seconds (154935296 bytes)
[  108.280824] test 10 (256 bit key, 16 byte blocks): 4595524 operations in 1 
seconds (73528384 bytes)
[  109.287496] test 11 (256 bit key, 64 byte blocks): 1997893 operations in 1 
seconds (127865152 bytes)
[  110.294168] test 12 (256 bit key, 256 byte blocks): 533790 operations in 1 
seconds (136650240 bytes)
[  111.300679] test 13 (256 bit key, 1024 byte blocks): 148787 operations in 1 
seconds (152357888 bytes)
[  112.303561] test 14 (256 bit key, 8192 byte blocks): 19146 operations in 1 
seconds (156844032 bytes)
[  113.310104] 
[  113.310104] testing speed of async ecb(twofish) decryption
[  113.319419] test 0 (128 bit key, 16 byte blocks): 4754043 operations in 1 
seconds (76064688 bytes)
[  114.324768] test 1 (128 bit key, 64 byte blocks): 1831420 operations in 1 
seconds (117210880 bytes)
[  115.331441] test 2 (128 bit key, 256 byte blocks): 541170 operations in 1 
seconds (138539520 bytes)
[  116.337957] test 3 (128 bit key, 1024 byte blocks): 150538 operations in 1 
seconds (154150912 bytes)
[  117.344571] test 4 (128 bit key, 8192 byte blocks): 19397 operations in 1 
seconds (158900224 bytes)
[  118.351122] test 5 (192 bit key, 16 byte blocks): 4753957 operations in 1 
seconds (76063312 bytes)
[  119.357778] test 6 (192 bit key, 64 byte blocks): 1828676 operations in 1 
seconds (117035264 bytes)
[  120.364459] test 7 (192 bit key, 256 byte blocks): 540331 operations in 1 
seconds (138324736 bytes)
[  121.370969] test 8 (192 bit key, 1024 byte blocks): 150348 operations in 1 
seconds (153956352 bytes)
[  122.377573] test 9 (192 bit key, 8192 byte blocks): 19196 operations in 1 
seconds (157253632 bytes)
[  123.384080] test 10 (256 bit key, 16 byte blocks): 4664399 operations in 1 
seconds (74630384 bytes)
[  124.390782] test 11 (256 bit key, 64 byte blocks): 1839324 operations in 1 
seconds (117716736 bytes)
[  125.397463] test 12 (256 bit key, 256 byte blocks): 538735 operations in 1 
seconds (137916160 bytes)
[  126.403962] test 13 (256 bit key, 1024 byte blocks): 150489 operations in 1 
seconds (154100736 bytes)
[  127.410567] test 14 (256 bit key, 8192 byte blocks): 19397 operations in 1 
seconds (158900224 bytes)
[  128.417091] 
[  128.417091] testing speed of async cbc(twofish) encryption
[  128.431227] test 0 (128 bit key, 16 byte blocks): 4681239 operations in 1 
seconds (74899824 bytes)
[  129.439466] test 1 (128 bit key, 64 byte blocks): 1836636 operations in 1 
seconds (117544704 bytes)
[  130.446131] test 2 (128 bit key, 256 byte blocks): 536055 operations in 1 
seconds (137230080 bytes)
[  131.452631] test 3 (128 bit key, 1024 byte blocks): 140955 operations in 1 
seconds (144337920 bytes)
[  132.459243] test 4 (128 bit key, 8192 byte blocks): 17821 operations in 1 
seconds (145989632 bytes)
[  133.466124] test 5 (192 bit key, 16 byte blocks): 4674373 operations in 1 
seconds (74789968 bytes)
[  134.472728] test 6 (192 bit key, 64 byte blocks): 1835821 operations in 1 
seconds (117492544 bytes)
[  135.479374] test 7 (192 bit key, 256 byte blocks): 535882 operations in 1 
seconds (137185792 bytes)
[  136.485876] test 8 (192 bit key, 1024 byte blocks): 140917 operations in 1 
seconds (144299008 bytes)
[  137.492470] test 9 (192 bit key, 8192 byte blocks): 17707 operations in 1 
seconds (145055744 bytes)
[  138.498979] test 10 (256 bit key, 16 byte blocks): 4674648 

Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

2012-08-15 Thread Jussi Kivilinna
Quoting Borislav Petkov b...@alien8.de:

 On Wed, Aug 15, 2012 at 05:22:03PM +0300, Jussi Kivilinna wrote:

 Patch replaces 'movb' instructions with 'movzbl' to break false
 register dependencies and interleaves instructions better for
 out-of-order scheduling.

 Also move common round code to separate function to reduce object
 size.

 Ok, redid the first test


Thanks.

 $ modprobe twofish-avx-x86_64
 $ modprobe tcrypt mode=504 sec=1

 and from quickly juxtaposing the two results, I'd say the patch makes
 things slightly worse but you'd need to run your scripts on it to get
 the accurate results:


About 5% slower, probably because I was tuning for sandy-bridge and introduced
more FPU<=>CPU register moves.

Here's a new version of the patch, with the FPU<=>CPU moves from the original
implementation.

(Note: it also changes the encryption function to inline all code into the main
function; decryption still places common code in a separate function to reduce
object size. This is to measure the difference.)

-Jussi

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  124 +--
 1 file changed, 77 insertions(+), 47 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..d331ab8 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -47,15 +47,22 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RX1 %xmm10
+#define RY1 %xmm11
+
+#define RK1 %xmm12
+#define RK2 %xmm13
+
+#define RT %xmm14
 
 #define RID1  %rax
+#define RID1d %eax
 #define RID1b %al
 #define RID2  %rbx
+#define RID2d %ebx
 #define RID2b %bl
 
 #define RGI1   %rdx
@@ -73,40 +80,48 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   shrq $16,   src; \
movlt0(CTX, RID1, 4), dst ## d;  \
xorlt1(CTX, RID2, 4), dst ## d;  \
-   shrq $16,   src; \
-   movbsrc ## bl,RID1b; \
-   movbsrc ## bh,RID2b; \
+   movzbl  src ## bl,RID1d; \
+   movzbl  src ## bh,RID2d; \
+   interleave_op(il_reg);   \
xorlt2(CTX, RID1, 4), dst ## d;  \
xorlt3(CTX, RID2, 4), dst ## d;
 
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+   shrq $16,   reg;
+
 #define G(a, x, t0, t1, t2, t3) \
-   vmovq   a,RGI1;   \
-   vpsrldq $8, a,x;  \
-   vmovq   x,RGI2;   \
+   vmovq   a, RGI1;  \
+   vpextrq $1, a, RGI2;  \
\
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-   shrq $16,   RGI1; \
-   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
+   lookup_32bit(t0, t1, t2, t3, RGI1, RGS1, shr_next, RGI1); \
+   lookup_32bit(t0, t1, t2, t3, RGI1, RGS2, dummy, none); \
shlq $32,   RGS2; \
orq RGS1, RGS2;   \
\
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
-   shrq $16,   RGI2; \
-   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
-   shlq $32,   RGS3; \
+   lookup_32bit(t0, t1, t2, t3, RGI2, RGS3, shr_next, RGI2); \
+   lookup_32bit(t0, t1, t2, t3, RGI2, RGS1, dummy, none); \
+   shlq $32,   RGS1; \
orq RGS1, RGS3;   \
\
vmovq   RGS2, x;  \
vpinsrq $1, RGS3, x, x;
 
-#define encround(a, b, c, d, x, y) \
-   G(a, x, s0, s1, s2, s3);   \
-   G(b, y, s1, s2, s3, s0);   \
+#define encround_g1g2(a, b, c, d, x, y) \
+   G(a, x, s0, s1, s2, s3); \
+   G(b, y, s1, s2, s3, s0);
+
+#define encround_end(a, b, c, d, x, y) \
+   vpslld $1,  d, RT; \
+   vpsrld $(32 - 1),   d, d;  \
+   vpord, RT,  d; \
vpaddd  x, y,   x; \
vpaddd  y, x,   y; \
vpaddd  x, RK1, x; \
@@ -115,14 +130,16 @@
vpsrld $1,  c, x;  \
vpslld $(32 - 1),   c, c;  \
vporc, x,   c; \
-   vpslld $1,  d, x;  \
-   vpsrld $(32 - 1),   d, d;  \
-   vpord, x,   d; \
vpxor