[PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation

2009-09-15 Thread Huang Ying
PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
carry-less multiplication. More information about PCLMULQDQ can be
found at:

http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/

Because PCLMULQDQ changes XMM state, its usage must be enclosed with
kernel_fpu_begin/end, which can be used only in process context. The
acceleration is therefore implemented as crypto_ahash: requests in soft
IRQ context will be deferred to the cryptd kernel thread.

v4:
 - Fix some style issues.

v3:
 - Revise GHASH implementation, performance increase about 2x.

Signed-off-by: Huang Ying 
---
 arch/x86/crypto/Makefile   |3 
 arch/x86/crypto/ghash-clmulni-intel_asm.S  |  157 +
 arch/x86/crypto/ghash-clmulni-intel_glue.c |  333 +
 arch/x86/include/asm/cpufeature.h  |1 
 crypto/Kconfig |8 
 crypto/cryptd.c|7 
 include/crypto/cryptd.h|1 
 7 files changed, 510 insertions(+)
 create mode 100644 arch/x86/crypto/ghash-clmulni-intel_asm.S
 create mode 100644 arch/x86/crypto/ghash-clmulni-intel_glue.c

--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 
@@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_6
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+
+ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,157 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains accelerated part of ghash
+ * implementation. More information about PCLMULQDQ can be found at:
+ *
+ * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ * Copyright (c) 2009 Intel Corp.
+ *   Author: Huang Ying 
+ *  Vinodh Gopal
+ *  Erdinc Ozturk
+ *  Deniz Karakoyunlu
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+/*
+ * 128-bit constants, 16-byte aligned so that they can be loaded with
+ * movaps.
+ *
+ * .Lbswap_mask: pshufb control mask that reverses the order of the 16
+ *               bytes of an XMM register.
+ * .Lpoly:       bits 127, 126, 121 and 0 set.
+ *               NOTE(review): presumably the GHASH reduction polynomial
+ *               used when preparing the hash key; not referenced in the
+ *               visible part of this file - confirm against the setkey
+ *               code.
+ * .Ltwo_one:    the 32-bit value 1 in dwords 3 and 0, zero elsewhere.
+ *               NOTE(review): not referenced in the visible part of this
+ *               file - confirm its use against the setkey code.
+ */
+.align 16
+.Lbswap_mask:
+   .octa 0x000102030405060708090a0b0c0d0e0f
+.Lpoly:
+   .octa 0xc2000000000000000000000000000001
+.Ltwo_one:
+   .octa 0x00000001000000000000000000000001
+
+/*
+ * Register aliases used by the routines below.
+ *
+ * DATA:   operand1 on entry to __clmul_gf128mul_ble, product on return
+ * SHASH:  operand2 (the hash key)
+ * T1-T3:  temporaries, changed by __clmul_gf128mul_ble
+ * BSWAP:  holds .Lbswap_mask for pshufb
+ * IN1:    not referenced in the visible part of this file
+ */
+#define DATA   %xmm0
+#define SHASH  %xmm1
+#define T1     %xmm2
+#define T2     %xmm3
+#define T3     %xmm4
+#define BSWAP  %xmm5
+#define IN1    %xmm6
+
+.text
+
+/*
+ * __clmul_gf128mul_ble:   internal ABI
+ *
+ * Multiply the two 128-bit operands in DATA and SHASH and reduce the
+ * 256-bit carry-less product back to 128 bits.
+ *
+ * Below, a1/a0 are the high/low 64-bit halves of DATA and b1/b0 the
+ * high/low 64-bit halves of SHASH. The product is built Karatsuba
+ * style from three 64x64 carry-less multiplications: a0 * b0, a1 * b1
+ * and (a1 + a0) * (b1 + b0), where "+" is xor. The 256-bit product is
+ * held in <T1:DATA> (T1 = high 128 bits, DATA = low 128 bits) and is
+ * then folded into DATA by two shift/xor reduction phases.
+ *
+ * The PCLMULQDQ instructions are emitted as raw .byte sequences; the
+ * mnemonic form of each one is given in the comment line above it.
+ *
+ * input:
+ * DATA:   operand1
+ * SHASH:  operand2, hash_key << 1 mod poly
+ * output:
+ * DATA:   operand1 * operand2 mod poly
+ * changed:
+ * T1
+ * T2
+ * T3
+ */
+__clmul_gf128mul_ble:
+   movaps DATA, T1                 # T1 = copy of operand1
+   pshufd $0b01001110, DATA, T2    # T2 = DATA with its 64-bit halves swapped
+   pshufd $0b01001110, SHASH, T3   # T3 = SHASH with its 64-bit halves swapped
+   pxor DATA, T2                   # both halves of T2 = a1 + a0
+   pxor SHASH, T3                  # both halves of T3 = b1 + b0
+
+   # pclmulqdq $0x00, SHASH, DATA  # DATA = a0 * b0
+   .byte 0x66, 0x0f, 0x3a, 0x44, 0xc1, 0x00
+   # pclmulqdq $0x11, SHASH, T1# T1 = a1 * b1
+   .byte 0x66, 0x0f, 0x3a, 0x44, 0xd1, 0x11
+   # pclmulqdq $0x00, T3, T2   # T2 = (a1 + a0) * (b1 + b0)
+   .byte 0x66, 0x0f, 0x3a, 0x44, 0xdc, 0x00
+   pxor DATA, T2                   # cancel the a0 * b0 term
+   pxor T1, T2 # T2 = a0 * b1 + a1 * b0
+
+   movaps T2, T3
+   pslldq $8, T3                   # T3 = low half of the middle term, in the high half
+   psrldq $8, T2                   # T2 = high half of the middle term, in the low half
+   pxor T3, DATA
+   pxor T2, T1 # <T1:DATA> is result of
+   # carry-less multiplication
+
+   # first phase of the reduction
+   movaps DATA, T3
+   psllq $1, T3
+   pxor DATA, T3
+   psllq $5, T3
+   pxor DATA, T3
+   psllq $57, T3                   # per 64-bit lane: T3 = DATA << 63 ^ DATA << 62 ^ DATA << 57
+   movaps T3, T2
+   pslldq $8, T2                   # low lane of T3 goes into the high half of DATA
+   psrldq $8, T3                   # high lane of T3 goes into the low half of T1
+   pxor T2, DATA
+   pxor T3, T1
+
+   # second phase of the reduction
+   movaps DATA, T2
+   psrlq $5, T2
+   pxor DATA, T2
+   psrlq $1, T2
+   pxor DATA, T2
+   psrlq $1, T2                    # per 64-bit lane: T2 = DATA >> 7 ^ DATA >> 2 ^ DATA >> 1
+   pxor T2, T1
+   pxor T1, DATA                   # fold the high 128 bits into the result
+   ret
+
+/*
+ * void clmul_ghash_mul(char *dst, const be128 *shash)
+ *
+ * Multiply the 16-byte block at dst by the 16-byte value at shash and
+ * store the product back to dst. The block is byte-reversed before the
+ * multiplication and the product is byte-reversed again before it is
+ * stored.
+ *
+ * input:
+ * %rdi:   dst, read and written with unaligned moves
+ * %rsi:   shash, loaded unchanged into SHASH (see the SHASH input of
+ *         __clmul_gf128mul_ble for the expected form)
+ * changed:
+ * DATA, SHASH, BSWAP, T1, T2, T3
+ *
+ * This routine does not save or restore any XMM register itself;
+ * presumably the caller brackets the call with
+ * kernel_fpu_begin/kernel_fpu_end - confirm against the glue code.
+ */
+ENTRY(clmul_ghash_mul)
+   movups (%rdi), DATA
+   movups (%rsi), SHASH
+   # RIP-relative load: no absolute relocation is needed for the mask,
+   # which is 16-byte aligned as movaps requires
+   movaps .Lbswap_mask(%rip), BSWAP
+   pshufb BSWAP, DATA              # reverse the byte order of the block
+   call __clmul_gf128mul_ble       # DATA = DATA * SHASH mod poly
+   pshufb BSWAP, DATA              # back to the byte order used in memory
+   movups DATA, (%rdi)
+   ret
+
+/*
+ * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ *

Re: [PATCH -v3] crypto: Add PCLMULQDQ accelerated GHASH implementation

2009-09-15 Thread Huang Ying
On Tue, 2009-09-15 at 22:42 +0800, Daniel Walker wrote: 
> On Tue, 2009-09-15 at 13:42 +0800, Huang Ying wrote:
> > Hi, Herbert,
> > 
> > The dependency to irq_fpu_usable has been merged by linus' tree.
> > 
> > Best Regards,
> > Huang Ying
> > -->
> > PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
> > carry-less multiplication. More information about PCLMULQDQ can be
> > found at:
> > 
> > http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
> > 
> > Because PCLMULQDQ changes XMM state, its usage must be enclosed with
> > kernel_fpu_begin/end, which can be used only in process context, the
> > acceleration is implemented as crypto_ahash. That is, request in soft
> > IRQ context will be defered to the cryptd kernel thread.
> > 
> > v3:
> >  - Revise GHASH implementation, performance increase about 2x.
> 
> 
> You have three style issues in this patch, for instance you shouldn't
> set values inside if() statements .. If you run checkpatch on this it
> will tell you all the style problems. Could you run this through
> checkpatch and fix any errors it's find ?

Thanks, I will fix them.

Best Regards,
Huang Ying

--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH]: fix repetition test for hardware RNG to be FIPS compliant (v2)

2009-09-15 Thread Sebastian Andrzej Siewior
* Neil Horman | 2009-09-14 12:30:43 [-0400]:

>Ok, version 2 of the patch, taking comments into account
looks good.

Sebastian
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ESP hardware acceleration

2009-09-15 Thread Octavian Purdila
On Tuesday 15 September 2009 20:12:52 you wrote:

> > However, I think that the best results for hw accel will be obtained if
> > you accelerate the AEAD interface.
> 
> If your driver benefits from seeing both the hashing request and the
> cipher request at the same time then by all means go for the AEAD
> interface.  But don't feel compelled to use it just because it's
> there :)

I think this interface has the advantage of doing only one DMA transfer per 
ESP packet instead of two such transfers required when using separate encr + 
auth. (of course this may not matter at all on some architectures)

> > Speaking of hw accel, we are also playing with it and we got moderately
> > good results. We are now running into two major software bottlenecks:
> > memcpy (because of the copy required by TCP traffic) and CRC computation.
> 
> What platform is this? 
> 

It's a ppc750 CPU clocked at 1GHz - pretty low end compared with today's 
hardware. We were able to get about 360Mbits L2 throughput (TCP traffic) with 
our hw accel engine although theoretically the hw engine can go up much higher 
(and profiling the hw engine itself shows that it is significantly idle).

>  And where does CRC come into this?

Sorry, what I meant was TCP checksum. 

tavi





--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ESP hardware acceleration

2009-09-15 Thread Herbert Xu
Octavian Purdila  wrote:
>
> AFAK, the crypto interface is asynchronous but the hashing interface (as used 
> in IPSec) is synchronous.
> 
> There are two patches I've recently seen on the list, one for converting to 
> async hashing and one for parallel crypto/ipsec which will probably get in 
> 2.6.32.

Yes they're now in Linus's tree so both hashing and ciphers are
now async.

> However, I think that the best results for hw accel will be obtained if you 
> accelerate the AEAD interface.

If your driver benefits from seeing both the hashing request and the
cipher request at the same time then by all means go for the AEAD
interface.  But don't feel compelled to use it just because it's
there :)

> Speaking of hw accel, we are also playing with it and we got moderately good 
> results. We are now running into two major software bottlenecks: memcpy 
> (because of the copy required by TCP traffic) and CRC computation.

What platform is this? And where does CRC come into this?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ESP hardware acceleration

2009-09-15 Thread Herbert Xu
Dimitrios Siganos  wrote:
> 
> What I would like to know is:
> 1) does the xfrm/ESP implementation support asynchronous/parallel packet 
> operation?
> 2) If yes, does it support it in both directions (tx/rx)?

Yes on both counts.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ESP hardware acceleration

2009-09-15 Thread Octavian Purdila
On Tuesday 15 September 2009 16:19:27 you wrote:
> Hi,
> 
> We are using linux-2.6.28 and we would like to hardware accelerate the
> NETKEY IPsec traffic. We are using strongswan for the upper layers.
> 
> I understand that strongswan uses the Linux/NETKEY IPsec implementation,
> which in turn, uses the Linux Scatterlist Crypto API for all its
> cryptographic work. To hardware accelerate IPsec, I need to write a
> "Linux Scatterlist Crypto API" driver for my hardware accelerator and
> register it with the linux kernel.
> 
> What I would like to know is:
> 1) does the xfrm/ESP implementation support asynchronous/parallel packet
> operation?
> 2) If yes, does it support it in both directions (tx/rx)?
> 
> Our hardware supports a queue packets for processing and we would like
> to utilise that, to keep the hardware as busy as possible i.e. we would
> like to be able to send multiple packets to the hardware engine for
> encryption/hashing and then receive multiple acknowledgements that the
> packets are ready.
> 

Hi Dimitrios,

AFAIK, the crypto interface is asynchronous but the hashing interface (as used 
in IPSec) is synchronous.

There are two patches I've recently seen on the list, one for converting to 
async hashing and one for parallel crypto/ipsec which will probably get in 
2.6.32.

However, I think that the best results for hw accel will be obtained if you 
accelerate the AEAD interface.

Speaking of hw accel, we are also playing with it and we got moderately good 
results. We are now running into two major software bottlenecks: memcpy 
(because of the copy required by TCP traffic) and CRC computation.

To solve the first issue we were thinking of extending the ESP implementation 
to create two scatter-gather lists / skbs, one which will be used as the 
source and one which will be used as the destination. This will allow us to 
offload the memcpy operation to hardware.

We will soon start working on this - after a bit of stabilization we need to 
do and we will start pestering you crypto wizards with questions /  patches, 
but in the meanwhile, if you have any advice on this topic we will greatly 
appreciate it :)

Thanks!
tavi
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -v3] crypto: Add PCLMULQDQ accelerated GHASH implementation

2009-09-15 Thread Daniel Walker
On Tue, 2009-09-15 at 13:42 +0800, Huang Ying wrote:
> Hi, Herbert,
> 
> The dependency to irq_fpu_usable has been merged by linus' tree.
> 
> Best Regards,
> Huang Ying
> -->
> PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
> carry-less multiplication. More information about PCLMULQDQ can be
> found at:
> 
> http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
> 
> Because PCLMULQDQ changes XMM state, its usage must be enclosed with
> kernel_fpu_begin/end, which can be used only in process context, the
> acceleration is implemented as crypto_ahash. That is, request in soft
> IRQ context will be defered to the cryptd kernel thread.
> 
> v3:
>  - Revise GHASH implementation, performance increase about 2x.


You have three style issues in this patch, for instance you shouldn't
set values inside if() statements. If you run checkpatch on this it
will tell you all the style problems. Could you run this through
checkpatch and fix any errors it finds?

Daniel

--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


ESP hardware acceleration

2009-09-15 Thread Dimitrios Siganos

Hi,

We are using linux-2.6.28 and we would like to hardware accelerate the 
NETKEY IPsec traffic. We are using strongswan for the upper layers.


I understand that strongswan uses the Linux/NETKEY IPsec implementation, 
which in turn, uses the Linux Scatterlist Crypto API for all its 
cryptographic work. To hardware accelerate IPsec, I need to write a 
"Linux Scatterlist Crypto API" driver for my hardware accelerator and 
register it with the linux kernel.


What I would like to know is:
1) does the xfrm/ESP implementation support asynchronous/parallel packet 
operation?

2) If yes, does it support it in both directions (tx/rx)?

Our hardware supports queueing packets for processing and we would like 
to utilise that, to keep the hardware as busy as possible, i.e. we would 
like to be able to send multiple packets to the hardware engine for 
encryption/hashing and then receive multiple acknowledgements that the 
packets are ready.


Regards,
Dimitrios Siganos

--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html