[PATCH 3/3] crypto: x86/chacha20 - Add a 4-block AVX-512VL variant

2018-11-20 Thread Martin Willi
This version uses the same principle as the AVX2 version by scheduling the
operations for two block pairs in parallel. It benefits from the AVX-512VL
rotate instructions and the more efficient partial block handling using
"vmovdqu8", resulting in a speedup of the raw block function of ~20%.

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-avx512vl-x86_64.S | 272 +
 arch/x86/crypto/chacha20_glue.c|   7 +
 2 files changed, 279 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S 
b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
index 261097578715..55d34de29e3e 100644
--- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
@@ -12,6 +12,11 @@
 CTR2BL: .octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001
 
+.section   .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+	.octa 0x00000000000000000000000000000003
+
 .section   .rodata.cst32.CTR8BL, "aM", @progbits, 32
 .align 32
 CTR8BL: .octa 0x00000003000000020000000100000000
@@ -185,6 +190,273 @@ ENTRY(chacha20_2block_xor_avx512vl)
 
 ENDPROC(chacha20_2block_xor_avx512vl)
 
+ENTRY(chacha20_4block_xor_avx512vl)
+   # %rdi: Input state matrix, s
+   # %rsi: up to 4 data blocks output, o
+   # %rdx: up to 4 data blocks input, i
+   # %rcx: input/output length in bytes
+
+   # This function encrypts four ChaCha20 blocks by loading the state
+   # matrix four times across eight AVX registers. It performs matrix
+   # operations on four words in two matrices in parallel, sequentially
+   # to the operations on the four words of the other two matrices. The
+   # required word shuffling has a rather high latency, so we can do the
+   # arithmetic on two matrix-pairs without much slowdown.
+
+   vzeroupper
+
+   # x0..3[0-4] = s0..3
+   vbroadcasti128  0x00(%rdi),%ymm0
+   vbroadcasti128  0x10(%rdi),%ymm1
+   vbroadcasti128  0x20(%rdi),%ymm2
+   vbroadcasti128  0x30(%rdi),%ymm3
+
+   vmovdqa %ymm0,%ymm4
+   vmovdqa %ymm1,%ymm5
+   vmovdqa %ymm2,%ymm6
+   vmovdqa %ymm3,%ymm7
+
+   vpaddd  CTR2BL(%rip),%ymm3,%ymm3
+   vpaddd  CTR4BL(%rip),%ymm7,%ymm7
+
+   vmovdqa %ymm0,%ymm11
+   vmovdqa %ymm1,%ymm12
+   vmovdqa %ymm2,%ymm13
+   vmovdqa %ymm3,%ymm14
+   vmovdqa %ymm7,%ymm15
+
+   mov $10,%rax
+
+.Ldoubleround4:
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxord  %ymm0,%ymm3,%ymm3
+   vprold  $16,%ymm3,%ymm3
+
+   vpaddd  %ymm5,%ymm4,%ymm4
+   vpxord  %ymm4,%ymm7,%ymm7
+   vprold  $16,%ymm7,%ymm7
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxord  %ymm2,%ymm1,%ymm1
+   vprold  $12,%ymm1,%ymm1
+
+   vpaddd  %ymm7,%ymm6,%ymm6
+   vpxord  %ymm6,%ymm5,%ymm5
+   vprold  $12,%ymm5,%ymm5
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxord  %ymm0,%ymm3,%ymm3
+   vprold  $8,%ymm3,%ymm3
+
+   vpaddd  %ymm5,%ymm4,%ymm4
+   vpxord  %ymm4,%ymm7,%ymm7
+   vprold  $8,%ymm7,%ymm7
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxord  %ymm2,%ymm1,%ymm1
+   vprold  $7,%ymm1,%ymm1
+
+   vpaddd  %ymm7,%ymm6,%ymm6
+   vpxord  %ymm6,%ymm5,%ymm5
+   vprold  $7,%ymm5,%ymm5
+
+   # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+   vpshufd $0x39,%ymm1,%ymm1
+   vpshufd $0x39,%ymm5,%ymm5
+   # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+   vpshufd $0x4e,%ymm2,%ymm2
+   vpshufd $0x4e,%ymm6,%ymm6
+   # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+   vpshufd $0x93,%ymm3,%ymm3
+   vpshufd $0x93,%ymm7,%ymm7
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxord  %ymm0,%ymm3,%ymm3
+   vprold  $16,%ymm3,%ymm3
+
+   vpaddd  %ymm5,%ymm4,%ymm4
+   vpxord  %ymm4,%ymm7,%ymm7
+   vprold  $16,%ymm7,%ymm7
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxord  %ymm2,%ymm1,%ymm1
+   vprold  $12,%ymm1,%ymm1
+
+   vpaddd  %ymm7,%ymm6,%ymm6
+   vpxord  %ymm6,%ymm5,%ymm5
+   vprold  $12,%ymm5,%ymm5
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxord  %ymm0,%ymm3,%ymm3
+   vprold  $8,%ymm3,%ymm3
+
+

[PATCH 0/3] crypto: x86/chacha20 - AVX-512VL block functions

2018-11-20 Thread Martin Willi
1032   1717   2175   1470   1995
1040   1768   2186   1456   1983
1048   1704   2185   1451   1950
1056   1770   2176   1410   1927
1064   1710   2178   1418   1918
1072   1753   2168   1394   1892
1080   1696   2170   1400   1892
1088   1761   2174   1472   2014
1096   1681   2158   1464   1968
1104   1746   2172   1457   1978
1112   1689   2167   1445   1955
1120   1738   2160   1431   1919
1128   1689   2155   1428   1915
1136   1747   2169   1415   1899
1144   1678   2161   1403   1881
1152   1749   2159   1474   2007
1160   1601   2050   1470   1991
1168   1648   2057   1461   1969
1176   1605   2043   1439   1948
1184   1654   2057   1428   1926
1192   1595   2051   1427   1899
1200   1647   2036   1419   1902
1208   1598   2048   1402   1888
1216   1643   2053   1471   1991
1224   1595   2043   1469   1987
1232   1649   2048   1456   1971
1240   1599   2040   1436   1939
1248   1644   2042   1433   1918
1256   1602   2045   1424   1900
1264   1648   2048   1413   1878
1272   1591   2034   1401   1878
1280   1649   2044   1475   2002
1288   1493   1984   1461   1972
1296   1484   1971   1438   1962
1304   1490   1985   1443   1947
1312   1535   1987   1425   1913
1320   1481   1965   1410   1901
1328   1493   1984   1407   1900
1336   1493   1979   1396   1882
1344   1526   1980   1465   1988
1352   1492   1970   1463   1983
1360   1487   1974   1452   1966
1368   1481   1977   1439   1937
1376   1535   1970   1428   1915
1384   1489   1973   1417   1905
1392   1483   1974   1415   1881
1400   1485   1963   1403   1882
1408   1523   1976   1466   1988
1416   1477   1969   1459   1964
1424   1487   1975   1455   1966
1432   1488   1972   1438   1941
1440   1518   1958   1432   1908
1448   1484   1972   1421   1905
1456   1485   1973   1398   1888
1464   1476   1962   1399   1870
1472   1530   1975   1471   1998
1480   1478   1967   1452   1979
1488   1478   1963   1453   1947
1496   1477   1963   1438   1930


Martin Willi (3):
  crypto: x86/chacha20 - Add a 8-block AVX-512VL variant
  crypto: x86/chacha20 - Add a 2-block AVX-512VL variant
  crypto: x86/chacha20 - Add a 4-block AVX-512VL variant

 arch/x86/crypto/Makefile   |   5 +
 arch/x86/crypto/chacha20-avx512vl-x86_64.S | 839 +
 arch/x86/crypto/chacha20_glue.c|  40 +
 3 files changed, 884 insertions(+)
 create mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S

-- 
2.17.1



[PATCH 2/3] crypto: x86/chacha20 - Add a 2-block AVX-512VL variant

2018-11-20 Thread Martin Willi
This version uses the same principle as the AVX2 version. It benefits
from the AVX-512VL rotate instructions and the more efficient partial
block handling using "vmovdqu8", resulting in a speedup of ~20%.

Unlike the AVX2 version, it is faster than the single block SSSE3 version
at processing a single block, hence we engage that function for (partial)
single block lengths as well.
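
For context, the resulting dispatch in the glue code looks roughly like
the following sketch (in the style of the existing chacha20_dosimd();
the literal hunk is part of this patch, chacha20_advance() is the
helper already in the glue code):

    /* Anything up to two blocks, including a single partial block,
     * now goes through the 2-block AVX-512VL function. */
    if (bytes <= CHACHA20_BLOCK_SIZE * 2) {
            chacha20_2block_xor_avx512vl(state, dst, src, bytes);
            state[12] += chacha20_advance(bytes, 2);
            return;
    }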

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-avx512vl-x86_64.S | 171 +
 arch/x86/crypto/chacha20_glue.c|   7 +
 2 files changed, 178 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S 
b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
index e1877afcaa73..261097578715 100644
--- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
@@ -7,6 +7,11 @@
 
 #include <linux/linkage.h>
 
+.section   .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+	.octa 0x00000000000000000000000000000001
+
 .section   .rodata.cst32.CTR8BL, "aM", @progbits, 32
 .align 32
 CTR8BL: .octa 0x00000003000000020000000100000000
@@ -14,6 +19,172 @@ CTR8BL: .octa 0x00000003000000020000000100000000
 
 .text
 
+ENTRY(chacha20_2block_xor_avx512vl)
+   # %rdi: Input state matrix, s
+   # %rsi: up to 2 data blocks output, o
+   # %rdx: up to 2 data blocks input, i
+   # %rcx: input/output length in bytes
+
+   # This function encrypts two ChaCha20 blocks by loading the state
+   # matrix twice across four AVX registers. It performs matrix operations
+   # on four words in each matrix in parallel, but requires shuffling to
+   # rearrange the words after each round.
+
+   vzeroupper
+
+   # x0..3[0-2] = s0..3
+   vbroadcasti128  0x00(%rdi),%ymm0
+   vbroadcasti128  0x10(%rdi),%ymm1
+   vbroadcasti128  0x20(%rdi),%ymm2
+   vbroadcasti128  0x30(%rdi),%ymm3
+
+   vpaddd  CTR2BL(%rip),%ymm3,%ymm3
+
+   vmovdqa %ymm0,%ymm8
+   vmovdqa %ymm1,%ymm9
+   vmovdqa %ymm2,%ymm10
+   vmovdqa %ymm3,%ymm11
+
+   mov $10,%rax
+
+.Ldoubleround:
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxord  %ymm0,%ymm3,%ymm3
+   vprold  $16,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxord  %ymm2,%ymm1,%ymm1
+   vprold  $12,%ymm1,%ymm1
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxord  %ymm0,%ymm3,%ymm3
+   vprold  $8,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxord  %ymm2,%ymm1,%ymm1
+   vprold  $7,%ymm1,%ymm1
+
+   # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+   vpshufd $0x39,%ymm1,%ymm1
+   # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+   vpshufd $0x4e,%ymm2,%ymm2
+   # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+   vpshufd $0x93,%ymm3,%ymm3
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxord  %ymm0,%ymm3,%ymm3
+   vprold  $16,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxord  %ymm2,%ymm1,%ymm1
+   vprold  $12,%ymm1,%ymm1
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxord  %ymm0,%ymm3,%ymm3
+   vprold  $8,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxord  %ymm2,%ymm1,%ymm1
+   vprold  $7,%ymm1,%ymm1
+
+   # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+   vpshufd $0x93,%ymm1,%ymm1
+   # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+   vpshufd $0x4e,%ymm2,%ymm2
+   # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+   vpshufd $0x39,%ymm3,%ymm3
+
+   dec %rax
+   jnz .Ldoubleround
+
+   # o0 = i0 ^ (x0 + s0)
+   vpaddd  %ymm8,%ymm0,%ymm7
+   cmp $0x10,%rcx
+   jl  .Lxorpart2
+   vpxord  0x00(%rdx),%xmm7,%xmm6
+   vmovdqu %xmm6,0x00(%rsi)
vextracti128 $1,%ymm7,%xmm0
+   # o1 = i1 ^ (x1 + s1)
+   vpaddd  %ymm9,%ymm1,%ymm7
+   cmp $0x20,%rcx
+   jl  .Lxorpart2
+   vpxord  0x10(%rdx),%xmm7,%xmm6
+   vmovdqu %xmm6,0x10(%rsi)
vextracti128 $1,%ymm7,%xmm1
+   # o2 = i2 ^ (x2 + s2)
+   vpaddd  %ymm10,%ymm2,%ymm7
+   cmp $0x30,%rcx
+   jl  .Lxorpart2
+   vpxord  0x20(%rdx),%xmm7,%xmm6
+   vmovdqu %xmm6,0x20(%rsi)
vextracti128 $1,%ymm7,%xmm2

[PATCH 1/3] crypto: x86/chacha20 - Add a 8-block AVX-512VL variant

2018-11-20 Thread Martin Willi
This variant is similar to the AVX2 version, but benefits from the AVX-512
rotate instructions and the additional registers, so it can operate without
any data on the stack. It uses ymm registers only to avoid the massive core
throttling on Skylake-X platforms. Nonetheless it brings a ~30% speed
improvement compared to the AVX2 variant for random encryption lengths.

The AVX2 version uses "rep movsb" for partial block XORing via the stack.
With AVX-512, the new "vmovdqu8" can do this much more efficiently. The
associated "kmov" instructions for working with dynamic masks are not part
of the AVX-512VL instruction set, hence we depend on AVX-512BW as well. Given
that the major AVX-512VL architectures provide AVX-512BW and this extension
does not affect core clocking, this seems to be no problem, at least for
now.
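
For illustration, the semantics of a byte-masked move, modeled in C
(this is a model of the instruction, not kernel code; the mask has one
bit per byte and is loaded into a k register with "kmov"):

    #include <stddef.h>
    #include <stdint.h>

    /* C model of a masked vmovdqu8 store: only bytes whose mask bit
     * is set are written, so a partial block touches no memory beyond
     * the remaining length. Masked loads work analogously. */
    static void masked_byte_store(uint8_t *dst, const uint8_t *src,
                                  uint64_t kmask, size_t width)
    {
            for (size_t i = 0; i < width; i++)
                    if (kmask & (1ULL << i))
                            dst[i] = src[i];
    }

A mask of ((1 << n) - 1) thus transfers exactly the first n bytes of a
block, which is what makes the partial block handling cheap.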

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/Makefile   |   5 +
 arch/x86/crypto/chacha20-avx512vl-x86_64.S | 396 +
 arch/x86/crypto/chacha20_glue.c|  26 ++
 3 files changed, 427 insertions(+)
 create mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a4b0007a54e1..ce4e43642984 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y
 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
 avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
+avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
 sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
 sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
 
@@ -103,6 +104,10 @@ ifeq ($(avx2_supported),yes)
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
 endif
 
+ifeq ($(avx512_supported),yes)
+   chacha20-x86_64-y += chacha20-avx512vl-x86_64.o
+endif
+
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S 
b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
new file mode 100644
index ..e1877afcaa73
--- /dev/null
+++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
@@ -0,0 +1,396 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section   .rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL: .octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha20_8block_xor_avx512vl)
+   # %rdi: Input state matrix, s
+   # %rsi: up to 8 data blocks output, o
+   # %rdx: up to 8 data blocks input, i
+   # %rcx: input/output length in bytes
+
+   # This function encrypts eight consecutive ChaCha20 blocks by loading
+   # the state matrix in AVX registers eight times. Compared to AVX2, this
+   # mostly benefits from the new rotate instructions in VL and the
+   # additional registers.
+
+   vzeroupper
+
+   # x0..15[0-7] = s[0..15]
+   vpbroadcastd 0x00(%rdi),%ymm0
+   vpbroadcastd 0x04(%rdi),%ymm1
+   vpbroadcastd 0x08(%rdi),%ymm2
+   vpbroadcastd 0x0c(%rdi),%ymm3
+   vpbroadcastd 0x10(%rdi),%ymm4
+   vpbroadcastd 0x14(%rdi),%ymm5
+   vpbroadcastd 0x18(%rdi),%ymm6
+   vpbroadcastd 0x1c(%rdi),%ymm7
+   vpbroadcastd 0x20(%rdi),%ymm8
+   vpbroadcastd 0x24(%rdi),%ymm9
+   vpbroadcastd 0x28(%rdi),%ymm10
+   vpbroadcastd 0x2c(%rdi),%ymm11
+   vpbroadcastd 0x30(%rdi),%ymm12
+   vpbroadcastd 0x34(%rdi),%ymm13
+   vpbroadcastd 0x38(%rdi),%ymm14
+   vpbroadcastd 0x3c(%rdi),%ymm15
+
+   # x12 += counter values 0-7
+   vpaddd  CTR8BL(%rip),%ymm12,%ymm12
+
+   vmovdqa64   %ymm0,%ymm16
+   vmovdqa64   %ymm1,%ymm17
+   vmovdqa64   %ymm2,%ymm18
+   vmovdqa64   %ymm3,%ymm19
+   vmovdqa64   %ymm4,%ymm20
+   vmovdqa64   %ymm5,%ymm21
+   vmovdqa64   %ymm6,%ymm22
+   vmovdqa64   %ymm7,%ymm23
+   vmovdqa64   %ymm8,%ymm24
+   vmovdqa64   %ymm9,%ymm25
+   vmovdqa64   %ymm10,%ymm26
+   vmovdqa64   %ymm11,%ymm27
+   vmovdqa64   %ymm12,%ymm28
+   vmovdqa64   %ymm13,%ymm29
+   vmovdqa64   %ymm14,%ymm30
+   vmovdqa64   %ymm15,%ymm31
+
+   mov $10,%eax
+
+.Ldoubleround8:
+   # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+   vpaddd  %ymm0,%ymm4,%ymm0
+   vpxord  %ymm0,%ymm12,%ymm12
+   vprold  $16,%ymm12,%ymm12
+

Re: [PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

2018-11-20 Thread Martin Willi
Hi Jason,

> [...] I have a massive Xeon Gold 5120 machine that I can give you
> access to if you'd like to do some testing and benching.

Thanks for the offer, no need at this time. But I certainly would
welcome if you could do some (Wireguard) benching with that code to see
if it works for you.

> Actually, similarly here, a 10nm Cannon Lake machine should be
> arriving at my house this week, which should make for some
> interesting testing ground for non-throttled zmm, if you'd like to
> play with it.

Maybe in a future iteration, thanks. In fact, it would be interesting to
know whether Cannon Lake handles that throttling better.

Regards
Martin



Re: [PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

2018-11-18 Thread Martin Willi
Hi Jason,

> I'd be inclined to roll with your implementation if it can eventually
> become competitive with Andy Polyakov's, [...]

I think for the SSSE3/AVX2 code paths it is competitive; especially for
small sizes it is faster, which is not that unimportant when
implementing layer 3 VPNs.

> there are still no AVX-512 paths, which means it's considerably
> slower on all newer generation Intel chips. Andy's has the AVX-512VL
> implementation for Skylake (using ymm, so as not to hit throttling)
> and AVX-512F for Cannon Lake and beyond (using zmm).

I don't think that having AVX-512F is that important until it is really
usable on CPUs in the market.

Adding AVX-512VL support is relatively simple. I have a patchset mostly
ready that is more than competitive with the code from Zinc. I'll clean
that up and do more testing before posting it later this week.

Best regards
Martin



[PATCH 6/6] crypto: x86/chacha20 - Add a 4-block AVX2 variant

2018-11-11 Thread Martin Willi
This variant builds upon the idea of the 2-block AVX2 variant that
shuffles words after each round. The shuffling has a rather high latency,
so the arithmetic units are not optimally used.

Given that we have plenty of registers in AVX, this version parallelizes
the 2-block variant to do four blocks. While the first two blocks are
shuffling, the CPU can do the XORing on the second two blocks and
vice-versa, which makes this version much faster than the SSSE3 variant
for four blocks. The latter is now mostly for systems that do not have
AVX2, but there it is the work-horse, so we keep it in place.
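
Schematically, this works because the two block pairs are data
independent; a scalar sketch of one interleaved step (pairs a and b
correspond to the two ymm register sets used below):

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t v, int n)
    {
            return (v << n) | (v >> (32 - n));
    }

    /* One interleaved quarter-round step: while the shuffle/rotate of
     * pair a is still in flight, the CPU can already execute the
     * additions of pair b, and vice-versa. */
    static void step_two_pairs(uint32_t a[4], uint32_t b[4])
    {
            a[0] += a[1];                   /* vpaddd, pair a */
            b[0] += b[1];                   /* vpaddd, pair b */
            a[3] = rotl32(a[3] ^ a[0], 16); /* vpxor/vpshufb, pair a */
            b[3] = rotl32(b[3] ^ b[0], 16); /* vpxor/vpshufb, pair b */
    }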

The partial XORing function trailer is very similar to the AVX2 2-block
variant. While it could be shared, that code segment is rather short;
profiling is also easier with the trailer integrated, so we keep it per
function.

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-avx2-x86_64.S | 310 +
 arch/x86/crypto/chacha20_glue.c|   7 +
 2 files changed, 317 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S 
b/arch/x86/crypto/chacha20-avx2-x86_64.S
index 8247076b0ba7..b6ab082be657 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -31,6 +31,11 @@ CTRINC: .octa 0x00000003000000020000000100000000
 CTR2BL: .octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001
 
+.section   .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+	.octa 0x00000000000000000000000000000003
+
 .text
 
 ENTRY(chacha20_2block_xor_avx2)
@@ -225,6 +230,311 @@ ENTRY(chacha20_2block_xor_avx2)
 
 ENDPROC(chacha20_2block_xor_avx2)
 
+ENTRY(chacha20_4block_xor_avx2)
+   # %rdi: Input state matrix, s
+   # %rsi: up to 4 data blocks output, o
+   # %rdx: up to 4 data blocks input, i
+   # %rcx: input/output length in bytes
+
+   # This function encrypts four ChaCha20 blocks by loading the state
+   # matrix four times across eight AVX registers. It performs matrix
+   # operations on four words in two matrices in parallel, sequentially
+   # to the operations on the four words of the other two matrices. The
+   # required word shuffling has a rather high latency, so we can do the
+   # arithmetic on two matrix-pairs without much slowdown.
+
+   vzeroupper
+
+   # x0..3[0-4] = s0..3
+   vbroadcasti128  0x00(%rdi),%ymm0
+   vbroadcasti128  0x10(%rdi),%ymm1
+   vbroadcasti128  0x20(%rdi),%ymm2
+   vbroadcasti128  0x30(%rdi),%ymm3
+
+   vmovdqa %ymm0,%ymm4
+   vmovdqa %ymm1,%ymm5
+   vmovdqa %ymm2,%ymm6
+   vmovdqa %ymm3,%ymm7
+
+   vpaddd  CTR2BL(%rip),%ymm3,%ymm3
+   vpaddd  CTR4BL(%rip),%ymm7,%ymm7
+
+   vmovdqa %ymm0,%ymm11
+   vmovdqa %ymm1,%ymm12
+   vmovdqa %ymm2,%ymm13
+   vmovdqa %ymm3,%ymm14
+   vmovdqa %ymm7,%ymm15
+
+   vmovdqa ROT8(%rip),%ymm8
+   vmovdqa ROT16(%rip),%ymm9
+
+   mov %rcx,%rax
+   mov $10,%ecx
+
+.Ldoubleround4:
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm9,%ymm3,%ymm3
+
+   vpaddd  %ymm5,%ymm4,%ymm4
+   vpxor   %ymm4,%ymm7,%ymm7
+   vpshufb %ymm9,%ymm7,%ymm7
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm10
+   vpslld  $12,%ymm10,%ymm10
+   vpsrld  $20,%ymm1,%ymm1
+   vpor    %ymm10,%ymm1,%ymm1
+
+   vpaddd  %ymm7,%ymm6,%ymm6
+   vpxor   %ymm6,%ymm5,%ymm5
+   vmovdqa %ymm5,%ymm10
+   vpslld  $12,%ymm10,%ymm10
+   vpsrld  $20,%ymm5,%ymm5
+   vpor    %ymm10,%ymm5,%ymm5
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm8,%ymm3,%ymm3
+
+   vpaddd  %ymm5,%ymm4,%ymm4
+   vpxor   %ymm4,%ymm7,%ymm7
+   vpshufb %ymm8,%ymm7,%ymm7
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm10
+   vpslld  $7,%ymm10,%ymm10
+   vpsrld  $25,%ymm1,%ymm1
+   vpor    %ymm10,%ymm1,%ymm1
+
+   vpaddd  %ymm7,%ymm6,%ymm6
+   vpxor   %ymm6,%ymm5,%ymm5
+   vmovdqa %ymm5,%ymm10
+   vpslld  $7,%ymm10,%ymm10
+   vpsrld  $25,%ymm5,%ymm5
+   vpor    %ymm10,%ymm5,%ymm5
+
+   # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+   vpshufd $0x39,%ymm1,%ymm1
+

[PATCH 3/6] crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant

2018-11-11 Thread Martin Willi
Add a length argument to the eight block function for AVX2, so the
block function may XOR only a partial length of eight blocks.

To avoid unnecessary operations, we integrate XORing of the first four
blocks in the final lane interleaving; this also avoids some work in
the partial lengths path.
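
Conceptually, the rewrite below turns "interleave all lanes, then XOR
everything" into a single pass that XORs each 32-byte lane as soon as
it has been assembled; a simplified C sketch, ignoring the lane
reordering that the interleave performs:

    #include <stdint.h>

    /* Sketch: emit one 32-byte output lane at a time; a short length
     * simply stops the stream and falls through to the partial-block
     * trailer (.Lxorpart8 in the assembly). */
    static void xor_lanes(uint8_t *dst, const uint8_t *src,
                          const uint8_t ks[16][32], unsigned int len)
    {
            unsigned int lane, i;

            for (lane = 0; lane < 16; lane++) {
                    if (len < (lane + 1) * 32)
                            return;
                    for (i = 0; i < 32; i++)
                            dst[lane * 32 + i] =
                                    src[lane * 32 + i] ^ ks[lane][i];
            }
    }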

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-avx2-x86_64.S | 189 +
 arch/x86/crypto/chacha20_glue.c|   5 +-
 2 files changed, 133 insertions(+), 61 deletions(-)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S 
b/arch/x86/crypto/chacha20-avx2-x86_64.S
index f3cd26f48332..7b62d55bee3d 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -30,8 +30,9 @@ CTRINC: .octa 0x00000003000000020000000100000000
 
 ENTRY(chacha20_8block_xor_avx2)
# %rdi: Input state matrix, s
-   # %rsi: 8 data blocks output, o
-   # %rdx: 8 data blocks input, i
+   # %rsi: up to 8 data blocks output, o
+   # %rdx: up to 8 data blocks input, i
+   # %rcx: input/output length in bytes
 
# This function encrypts eight consecutive ChaCha20 blocks by loading
# the state matrix in AVX registers eight times. As we need some
@@ -48,6 +49,7 @@ ENTRY(chacha20_8block_xor_avx2)
lea 8(%rsp),%r10
and $~31, %rsp
sub $0x80, %rsp
+   mov %rcx,%rax
 
# x0..15[0-7] = s[0..15]
vpbroadcastd0x00(%rdi),%ymm0
@@ -375,74 +377,143 @@ ENTRY(chacha20_8block_xor_avx2)
vpunpckhqdq %ymm15,%ymm0,%ymm15
 
# interleave 128-bit words in state n, n+4
-   vmovdqa 0x00(%rsp),%ymm0
-   vperm2i128  $0x20,%ymm4,%ymm0,%ymm1
-   vperm2i128  $0x31,%ymm4,%ymm0,%ymm4
-   vmovdqa %ymm1,0x00(%rsp)
-   vmovdqa 0x20(%rsp),%ymm0
-   vperm2i128  $0x20,%ymm5,%ymm0,%ymm1
-   vperm2i128  $0x31,%ymm5,%ymm0,%ymm5
-   vmovdqa %ymm1,0x20(%rsp)
-   vmovdqa 0x40(%rsp),%ymm0
-   vperm2i128  $0x20,%ymm6,%ymm0,%ymm1
-   vperm2i128  $0x31,%ymm6,%ymm0,%ymm6
-   vmovdqa %ymm1,0x40(%rsp)
-   vmovdqa 0x60(%rsp),%ymm0
-   vperm2i128  $0x20,%ymm7,%ymm0,%ymm1
-   vperm2i128  $0x31,%ymm7,%ymm0,%ymm7
-   vmovdqa %ymm1,0x60(%rsp)
+   # xor/write first four blocks
+   vmovdqa 0x00(%rsp),%ymm1
+   vperm2i128  $0x20,%ymm4,%ymm1,%ymm0
+   cmp $0x0020,%rax
+   jl  .Lxorpart8
+   vpxor   0x0000(%rdx),%ymm0,%ymm0
+   vmovdqu %ymm0,0x0000(%rsi)
+   vperm2i128  $0x31,%ymm4,%ymm1,%ymm4
+
vperm2i128  $0x20,%ymm12,%ymm8,%ymm0
+   cmp $0x0040,%rax
+   jl  .Lxorpart8
+   vpxor   0x0020(%rdx),%ymm0,%ymm0
+   vmovdqu %ymm0,0x0020(%rsi)
vperm2i128  $0x31,%ymm12,%ymm8,%ymm12
-   vmovdqa %ymm0,%ymm8
-   vperm2i128  $0x20,%ymm13,%ymm9,%ymm0
-   vperm2i128  $0x31,%ymm13,%ymm9,%ymm13
-   vmovdqa %ymm0,%ymm9
+
+   vmovdqa 0x40(%rsp),%ymm1
+   vperm2i128  $0x20,%ymm6,%ymm1,%ymm0
+   cmp $0x0060,%rax
+   jl  .Lxorpart8
+   vpxor   0x0040(%rdx),%ymm0,%ymm0
+   vmovdqu %ymm0,0x0040(%rsi)
+   vperm2i128  $0x31,%ymm6,%ymm1,%ymm6
+
vperm2i128  $0x20,%ymm14,%ymm10,%ymm0
+   cmp $0x0080,%rax
+   jl  .Lxorpart8
+   vpxor   0x0060(%rdx),%ymm0,%ymm0
+   vmovdqu %ymm0,0x0060(%rsi)
vperm2i128  $0x31,%ymm14,%ymm10,%ymm14
-   vmovdqa %ymm0,%ymm10
-   vperm2i128  $0x20,%ymm15,%ymm11,%ymm0
-   vperm2i128  $0x31,%ymm15,%ymm11,%ymm15
-   vmovdqa %ymm0,%ymm11
 
-   # xor with corresponding input, write to output
-   vmovdqa 0x00(%rsp),%ymm0
-   vpxor   0x0000(%rdx),%ymm0,%ymm0
-   vmovdqu %ymm0,0x0000(%rsi)
-   vmovdqa 0x20(%rsp),%ymm0
+   vmovdqa 0x20(%rsp),%ymm1
+   vperm2i128  $0x20,%ymm5,%ymm1,%ymm0
+   cmp $0x00a0,%rax
+   jl  .Lxorpart8
vpxor   0x0080(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0080(%rsi)
-   vmovdqa 0x40(%rsp),%ymm0
-   vpxor   0x0040(%rdx),%ymm0,%ymm0
-   vmovdqu %ymm0,0x0040(%rsi)
-   vmovdqa 0x60(%rsp),%ymm0
+   vperm2i128  $0x31,%ymm5,%ymm1,%ymm5
+
+   vperm2i128  $0x20,%ymm13,%ymm9,%ymm0
+   cmp $0x00c0,%rax
+   jl  .Lxorpart8
+   vpxor   0x00a0(%rdx),%ymm0,%ymm0
+   vmovdqu %ymm0,0x00a0(%rsi)
+   vperm2i128  $0x31,%ymm13,%ymm9,%ymm13
+
+   vmovdqa 0x60(%rsp),%ymm1
+   vperm2i128  $0x20,%ymm7,%ymm1,%ymm0
+   cmp $0x00e0,%rax

[PATCH 4/6] crypto: x86/chacha20 - Use larger block functions more aggressively

2018-11-11 Thread Martin Willi
Now that all block functions support partial lengths, engage the wider
block sizes more aggressively. This prevents using smaller block
functions multiple times, where the next larger block function would
have been faster.
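
The helper carrying this is chacha20_advance() in the diff below; it
converts a (possibly partial) byte count into the number of counter
increments. A few worked examples:

    /* chacha20_advance(len, maxblocks):
     * min(len, maxblocks * 64) rounded up to whole 64-byte blocks.
     *
     *   chacha20_advance(130, 8) == 3  (two full blocks + partial)
     *   chacha20_advance(200, 4) == 4  (three full blocks + partial)
     *   chacha20_advance(600, 8) == 8  (capped at the maximum)
     */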

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20_glue.c | 39 -
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 882e8bf5965a..b541da71f11e 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -29,6 +29,12 @@ asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 
*dst, const u8 *src,
 static bool chacha20_use_avx2;
 #endif
 
+static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
+{
+   len = min(len, maxblocks * CHACHA20_BLOCK_SIZE);
+   return round_up(len, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE;
+}
+
 static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
 {
@@ -41,6 +47,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 
*src,
dst += CHACHA20_BLOCK_SIZE * 8;
state[12] += 8;
}
+   if (bytes > CHACHA20_BLOCK_SIZE * 4) {
+   chacha20_8block_xor_avx2(state, dst, src, bytes);
+   state[12] += chacha20_advance(bytes, 8);
+   return;
+   }
}
 #endif
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
@@ -50,15 +61,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 
*src,
dst += CHACHA20_BLOCK_SIZE * 4;
state[12] += 4;
}
-   while (bytes >= CHACHA20_BLOCK_SIZE) {
-   chacha20_block_xor_ssse3(state, dst, src, bytes);
-   bytes -= CHACHA20_BLOCK_SIZE;
-   src += CHACHA20_BLOCK_SIZE;
-   dst += CHACHA20_BLOCK_SIZE;
-   state[12]++;
+   if (bytes > CHACHA20_BLOCK_SIZE) {
+   chacha20_4block_xor_ssse3(state, dst, src, bytes);
+   state[12] += chacha20_advance(bytes, 4);
+   return;
}
if (bytes) {
chacha20_block_xor_ssse3(state, dst, src, bytes);
+   state[12]++;
}
 }
 
@@ -82,17 +92,16 @@ static int chacha20_simd(struct skcipher_request *req)
 
kernel_fpu_begin();
 
-   while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
-   chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-   rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
-   err = skcipher_walk_done(,
-walk.nbytes % CHACHA20_BLOCK_SIZE);
-   }
+   while (walk.nbytes > 0) {
+   unsigned int nbytes = walk.nbytes;
+
+   if (nbytes < walk.total)
+   nbytes = round_down(nbytes, walk.stride);
 
-   if (walk.nbytes) {
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-   walk.nbytes);
-   err = skcipher_walk_done(, 0);
+   nbytes);
+
+   err = skcipher_walk_done(, walk.nbytes - nbytes);
}
 
kernel_fpu_end();
-- 
2.17.1



[PATCH 1/6] crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant

2018-11-11 Thread Martin Willi
Add a length argument to the single block function for SSSE3, so the
block function may XOR only a partial length of the full block. Given
that the setup code is rather cheap, the function does not process more
than one block; this allows us to keep the block function selection in
the C glue code.

The required branching does not negatively affect performance for full
block sizes. The partial XORing uses simple "rep movsb" to copy the
data before and after doing XOR in SSE. This is rather efficient on
modern processors; movsw can be slightly faster, but the additional
complexity is probably not worth it.
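
In C terms the .Lxorpart trailer added below is the classic bounce
buffer pattern; an equivalent sketch (the assembly performs both copies
with "rep movsb" and keeps the buffer on the stack):

    #include <stdint.h>
    #include <string.h>

    /* Copy the valid tail into a scratch block, XOR it against the
     * keystream held in an SSE register, and copy the result back. */
    static void chacha20_xor_partial(uint8_t *dst, const uint8_t *src,
                                     const uint8_t keystream[16],
                                     unsigned int len)
    {
            uint8_t buf[16];
            unsigned int i;

            memcpy(buf, src, len);          /* first rep movsb */
            for (i = 0; i < len; i++)
                    buf[i] ^= keystream[i]; /* pxor on the stack slot */
            memcpy(dst, buf, len);          /* second rep movsb */
    }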

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 74 -
 arch/x86/crypto/chacha20_glue.c | 11 ++--
 2 files changed, 63 insertions(+), 22 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S 
b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 512a2b500fd1..98d130b5e4ab 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -25,12 +25,13 @@ CTRINC: .octa 0x00000003000000020000000100000000
 
 ENTRY(chacha20_block_xor_ssse3)
# %rdi: Input state matrix, s
-   # %rsi: 1 data block output, o
-   # %rdx: 1 data block input, i
+   # %rsi: up to 1 data block output, o
+   # %rdx: up to 1 data block input, i
+   # %rcx: input/output length in bytes
 
# This function encrypts one ChaCha20 block by loading the state matrix
# in four SSE registers. It performs matrix operation on four words in
-   # parallel, but requireds shuffling to rearrange the words after each
+   # parallel, but requires shuffling to rearrange the words after each
# round. 8/16-bit word rotation is done with the slightly better
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
# traditional shift+OR.
@@ -48,7 +49,8 @@ ENTRY(chacha20_block_xor_ssse3)
movdqa  ROT8(%rip),%xmm4
movdqa  ROT16(%rip),%xmm5
 
-   mov $10,%ecx
+   mov %rcx,%rax
+   mov $10,%ecx
 
 .Ldoubleround:
 
@@ -122,27 +124,69 @@ ENTRY(chacha20_block_xor_ssse3)
jnz .Ldoubleround
 
# o0 = i0 ^ (x0 + s0)
-   movdqu  0x00(%rdx),%xmm4
paddd   %xmm8,%xmm0
+   cmp $0x10,%rax
+   jl  .Lxorpart
+   movdqu  0x00(%rdx),%xmm4
pxor    %xmm4,%xmm0
movdqu  %xmm0,0x00(%rsi)
# o1 = i1 ^ (x1 + s1)
-   movdqu  0x10(%rdx),%xmm5
paddd   %xmm9,%xmm1
-   pxor    %xmm5,%xmm1
-   movdqu  %xmm1,0x10(%rsi)
+   movdqa  %xmm1,%xmm0
+   cmp $0x20,%rax
+   jl  .Lxorpart
+   movdqu  0x10(%rdx),%xmm0
+   pxor    %xmm1,%xmm0
+   movdqu  %xmm0,0x10(%rsi)
# o2 = i2 ^ (x2 + s2)
-   movdqu  0x20(%rdx),%xmm6
paddd   %xmm10,%xmm2
-   pxor    %xmm6,%xmm2
-   movdqu  %xmm2,0x20(%rsi)
+   movdqa  %xmm2,%xmm0
+   cmp $0x30,%rax
+   jl  .Lxorpart
+   movdqu  0x20(%rdx),%xmm0
+   pxor    %xmm2,%xmm0
+   movdqu  %xmm0,0x20(%rsi)
# o3 = i3 ^ (x3 + s3)
-   movdqu  0x30(%rdx),%xmm7
paddd   %xmm11,%xmm3
-   pxor    %xmm7,%xmm3
-   movdqu  %xmm3,0x30(%rsi)
-
+   movdqa  %xmm3,%xmm0
+   cmp $0x40,%rax
+   jl  .Lxorpart
+   movdqu  0x30(%rdx),%xmm0
+   pxor    %xmm3,%xmm0
+   movdqu  %xmm0,0x30(%rsi)
+
+.Ldone:
ret
+
+.Lxorpart:
+   # xor remaining bytes from partial register into output
+   mov %rax,%r9
+   and $0x0f,%r9
+   jz  .Ldone
+   and $~0x0f,%rax
+
+   mov %rsi,%r11
+
+   lea 8(%rsp),%r10
+   sub $0x10,%rsp
+   and $~31,%rsp
+
+   lea (%rdx,%rax),%rsi
+   mov %rsp,%rdi
+   mov %r9,%rcx
+   rep movsb
+
+   pxor    0x00(%rsp),%xmm0
+   movdqa  %xmm0,0x00(%rsp)
+
+   mov %rsp,%rsi
+   lea (%r11,%rax),%rdi
+   mov %r9,%rcx
+   rep movsb
+
+   lea -8(%r10),%rsp
+   jmp .Ldone
+
 ENDPROC(chacha20_block_xor_ssse3)
 
 ENTRY(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index dce7c5d39c2f..cc4571736ce8 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -19,7 +19,8 @@
 
 #define CHACHA20_STATE_ALIGN 16
 
-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+					 unsigned int len);

[PATCH 2/6] crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant

2018-11-11 Thread Martin Willi
Add a length argument to the quad block function for SSSE3, so the
block function may XOR only a partial length of four blocks.

As we already have the stack set up, the partial XORing does not need
to do so again. This gives a slightly different function trailer, so we keep that
separate from the 1-block function.

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 163 ++--
 arch/x86/crypto/chacha20_glue.c |   5 +-
 2 files changed, 128 insertions(+), 40 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S 
b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 98d130b5e4ab..d8ac75bb448f 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -191,8 +191,9 @@ ENDPROC(chacha20_block_xor_ssse3)
 
 ENTRY(chacha20_4block_xor_ssse3)
# %rdi: Input state matrix, s
-   # %rsi: 4 data blocks output, o
-   # %rdx: 4 data blocks input, i
+   # %rsi: up to 4 data blocks output, o
+   # %rdx: up to 4 data blocks input, i
+   # %rcx: input/output length in bytes
 
# This function encrypts four consecutive ChaCha20 blocks by loading the
# the state matrix in SSE registers four times. As we need some scratch
@@ -207,6 +208,7 @@ ENTRY(chacha20_4block_xor_ssse3)
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
+   mov %rcx,%rax
 
# x0..15[0-3] = s0..3[0..3]
movq0x00(%rdi),%xmm1
@@ -617,58 +619,143 @@ ENTRY(chacha20_4block_xor_ssse3)
 
# xor with corresponding input, write to output
movdqa  0x00(%rsp),%xmm0
+   cmp $0x10,%rax
+   jl  .Lxorpart4
movdqu  0x00(%rdx),%xmm1
pxor    %xmm1,%xmm0
movdqu  %xmm0,0x00(%rsi)
-   movdqa  0x10(%rsp),%xmm0
-   movdqu  0x80(%rdx),%xmm1
+
+   movdqu  %xmm4,%xmm0
+   cmp $0x20,%rax
+   jl  .Lxorpart4
+   movdqu  0x10(%rdx),%xmm1
pxor    %xmm1,%xmm0
-   movdqu  %xmm0,0x80(%rsi)
+   movdqu  %xmm0,0x10(%rsi)
+
+   movdqu  %xmm8,%xmm0
+   cmp $0x30,%rax
+   jl  .Lxorpart4
+   movdqu  0x20(%rdx),%xmm1
+   pxor    %xmm1,%xmm0
+   movdqu  %xmm0,0x20(%rsi)
+
+   movdqu  %xmm12,%xmm0
+   cmp $0x40,%rax
+   jl  .Lxorpart4
+   movdqu  0x30(%rdx),%xmm1
+   pxor    %xmm1,%xmm0
+   movdqu  %xmm0,0x30(%rsi)
+
movdqa  0x20(%rsp),%xmm0
+   cmp $0x50,%rax
+   jl  .Lxorpart4
movdqu  0x40(%rdx),%xmm1
pxor    %xmm1,%xmm0
movdqu  %xmm0,0x40(%rsi)
+
+   movdqu  %xmm6,%xmm0
+   cmp $0x60,%rax
+   jl  .Lxorpart4
+   movdqu  0x50(%rdx),%xmm1
+   pxor    %xmm1,%xmm0
+   movdqu  %xmm0,0x50(%rsi)
+
+   movdqu  %xmm10,%xmm0
+   cmp $0x70,%rax
+   jl  .Lxorpart4
+   movdqu  0x60(%rdx),%xmm1
+   pxor    %xmm1,%xmm0
+   movdqu  %xmm0,0x60(%rsi)
+
+   movdqu  %xmm14,%xmm0
+   cmp $0x80,%rax
+   jl  .Lxorpart4
+   movdqu  0x70(%rdx),%xmm1
+   pxor    %xmm1,%xmm0
+   movdqu  %xmm0,0x70(%rsi)
+
+   movdqa  0x10(%rsp),%xmm0
+   cmp $0x90,%rax
+   jl  .Lxorpart4
+   movdqu  0x80(%rdx),%xmm1
+   pxor    %xmm1,%xmm0
+   movdqu  %xmm0,0x80(%rsi)
+
+   movdqu  %xmm5,%xmm0
+   cmp $0xa0,%rax
+   jl  .Lxorpart4
+   movdqu  0x90(%rdx),%xmm1
+   pxor    %xmm1,%xmm0
+   movdqu  %xmm0,0x90(%rsi)
+
+   movdqu  %xmm9,%xmm0
+   cmp $0xb0,%rax
+   jl  .Lxorpart4
+   movdqu  0xa0(%rdx),%xmm1
+   pxor    %xmm1,%xmm0
+   movdqu  %xmm0,0xa0(%rsi)
+
+   movdqu  %xmm13,%xmm0
+   cmp $0xc0,%rax
+   jl  .Lxorpart4
+   movdqu  0xb0(%rdx),%xmm1
+   pxor    %xmm1,%xmm0
+   movdqu  %xmm0,0xb0(%rsi)
+
movdqa  0x30(%rsp),%xmm0
+   cmp $0xd0,%rax
+   jl  .Lxorpart4
movdqu  0xc0(%rdx),%xmm1
pxor    %xmm1,%xmm0
movdqu  %xmm0,0xc0(%rsi)
-   movdqu  0x10(%rdx),%xmm1
-   pxor    %xmm1,%xmm4
-   movdqu  %xmm4,0x10(%rsi)
-   movdqu  0x90(%rdx),%xmm1
-   pxor    %xmm1,%xmm5
-   movdqu  %xmm5,0x90(%rsi)
-   movdqu  0x50(%rdx),%xmm1

[PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

2018-11-11 Thread Martin Willi
 1149 1523 1504
1400 1148 1517 1480
1408 1167 1561 1589
1416 1030 1516 1558
1424 1028 1516 1546
1432 1027 1522 1537
1440 1027 1564 1523
1448 1026 1507 1512
1456 1025 1515 1491
1464 1023 1522 1481
1472 1037 1559 1577
1480  927 1518 1559
1488  926 1514 1548
1496  926 1513 1534


Martin Willi (6):
  crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3
variant
  crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3
variant
  crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant
  crypto: x86/chacha20 - Use larger block functions more aggressively
  crypto: x86/chacha20 - Add a 2-block AVX2 variant
  crypto: x86/chacha20 - Add a 4-block AVX2 variant

 arch/x86/crypto/chacha20-avx2-x86_64.S  | 696 ++--
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 237 ++--
 arch/x86/crypto/chacha20_glue.c |  72 ++-
 3 files changed, 868 insertions(+), 137 deletions(-)

-- 
2.17.1



[PATCH 5/6] crypto: x86/chacha20 - Add a 2-block AVX2 variant

2018-11-11 Thread Martin Willi
This variant uses the same principle as the single block SSSE3 variant
by shuffling the state matrix after each round. With the wider AVX
registers, we can do two blocks in parallel, though.

This function can increase performance and efficiency significantly for
lengths that would otherwise require a 4-block function.

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-avx2-x86_64.S | 197 +
 arch/x86/crypto/chacha20_glue.c|   7 +
 2 files changed, 204 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S 
b/arch/x86/crypto/chacha20-avx2-x86_64.S
index 7b62d55bee3d..8247076b0ba7 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -26,8 +26,205 @@ ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
 CTRINC: .octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004
 
+.section   .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+	.octa 0x00000000000000000000000000000001
+
 .text
 
+ENTRY(chacha20_2block_xor_avx2)
+   # %rdi: Input state matrix, s
+   # %rsi: up to 2 data blocks output, o
+   # %rdx: up to 2 data blocks input, i
+   # %rcx: input/output length in bytes
+
+   # This function encrypts two ChaCha20 blocks by loading the state
+   # matrix twice across four AVX registers. It performs matrix operations
+   # on four words in each matrix in parallel, but requires shuffling to
+   # rearrange the words after each round.
+
+   vzeroupper
+
+   # x0..3[0-2] = s0..3
+   vbroadcasti128  0x00(%rdi),%ymm0
+   vbroadcasti128  0x10(%rdi),%ymm1
+   vbroadcasti128  0x20(%rdi),%ymm2
+   vbroadcasti128  0x30(%rdi),%ymm3
+
+   vpaddd  CTR2BL(%rip),%ymm3,%ymm3
+
+   vmovdqa %ymm0,%ymm8
+   vmovdqa %ymm1,%ymm9
+   vmovdqa %ymm2,%ymm10
+   vmovdqa %ymm3,%ymm11
+
+   vmovdqa ROT8(%rip),%ymm4
+   vmovdqa ROT16(%rip),%ymm5
+
+   mov %rcx,%rax
+   mov $10,%ecx
+
+.Ldoubleround:
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm5,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm6
+   vpslld  $12,%ymm6,%ymm6
+   vpsrld  $20,%ymm1,%ymm1
+   vpor    %ymm6,%ymm1,%ymm1
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm4,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm7
+   vpslld  $7,%ymm7,%ymm7
+   vpsrld  $25,%ymm1,%ymm1
+   vpor    %ymm7,%ymm1,%ymm1
+
+   # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+   vpshufd $0x39,%ymm1,%ymm1
+   # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+   vpshufd $0x4e,%ymm2,%ymm2
+   # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+   vpshufd $0x93,%ymm3,%ymm3
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm5,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm6
+   vpslld  $12,%ymm6,%ymm6
+   vpsrld  $20,%ymm1,%ymm1
+   vpor    %ymm6,%ymm1,%ymm1
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm4,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm7
+   vpslld  $7,%ymm7,%ymm7
+   vpsrld  $25,%ymm1,%ymm1
+   vpor    %ymm7,%ymm1,%ymm1
+
+   # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+   vpshufd $0x93,%ymm1,%ymm1
+   # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+   vpshufd $0x4e,%ymm2,%ymm2
+   # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+   vpshufd $0x39,%ymm3,%ymm3
+
+   dec %ecx
+   jnz .Ldoubleround
+
+   # o0 = i0 ^ (x0 + s0)
+   vpaddd  %ymm8,%ymm0,%ymm7
+   cmp $0x10,%rax
+   jl  .Lxorpart2
+   vpxor   0x00(%rdx),%xmm7,%xmm6
+   vmovdqu %xmm6,0x00(%rsi)
+   vextracti128 $1,%ymm7,%xmm0
+   # o1 = i1 ^ (x1 + s1)
+   vpaddd  %ymm9,%ymm1,%ymm7
+

Re: [RFC PATCH] crypto: chacha20 - add implementation using 96-bit nonce

2017-12-10 Thread Martin Willi
Hi,

> Anyway, I actually thought it was intentional that the ChaCha
> implementations in the Linux kernel allowed specifying the block
> counter, and therefore allowed seeking to any point in the keystream,
> exposing the full functionality of the cipher.

If I remember correctly, it was indeed intentional. When building the
chacha20poly1305 AEAD both in [1] and [2], a block counter of 0 is used
to generate the Poly1305 key. For the ChaCha20 encryption, an explicit
initial block counter of 1 is used to avoid reusing the same counter.

Maybe it would be possible to implement this with implicit counters,
but doing this explicitly looked much clearer to me. So I guess there
are use cases for explicit block counters in ChaCha20.
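
For reference, the counter layout in the RFC 7539 AEAD construction,
sketched as comments (illustrative, not the kernel API):

    /*
     * chacha20poly1305 per RFC 7539:
     *
     *   poly_key   = chacha20_block(key, nonce, counter = 0)[0..31]
     *   ciphertext = plaintext XOR chacha20(key, nonce, counter = 1)
     *   tag        = poly1305(poly_key, aad || ciphertext || lengths)
     */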

Best regards
Martin

[1] https://tools.ietf.org/html/rfc7539#section-2.8
[2] https://tools.ietf.org/html/rfc7634#section-2


Re: [PATCH v4] poly1305: generic C can be faster on chips with slow unaligned access

2016-11-08 Thread Martin Willi

> By using the unaligned access helpers, we drastically improve
> performance on small MIPS routers that have to go through the
> exception fix-up handler for these unaligned accesses.

I couldn't measure any slowdown here, so:

Acked-by: Martin Willi <mar...@strongswan.org>

> -   dctx->s[0] = le32_to_cpuvp(key +  0);
> +   dctx->s[0] = get_unaligned_le32(key +  0);

Not sure what the exact alignment rules for key/iv are, but maybe we
want to replace the same function in chacha20_generic.c as well?

Martin


Re: [PATCH] crypto: chacha20_4block_xor_ssse3: Align stack pointer to 64 bytes

2016-01-22 Thread Martin Willi
Hi Eli,

> This aligns the stack pointer in chacha20_4block_xor_ssse3 to 64 bytes.
> Fixes general protection faults and potential kernel panics.

I assumed 16-byte alignment according to the System V AMD64 ABI, but
this is obviously not true with -mpreferred-stack-boundary=3. The AVX2
version seems to be ok, so is Poly1305.

Acked-by: Martin Willi <mar...@strongswan.org>




[PATCH v2 03/10] crypto: chacha20 - Add a SSSE3 SIMD variant for x86_64

2015-07-16 Thread Martin Willi
Implements an x86_64 assembler driver for the ChaCha20 stream cipher. This
single block variant works on a single state matrix using SSE instructions.
It requires SSSE3 due to the use of pshufb for efficient 8/16-bit rotate
operations.
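
The pshufb trick works because a rotate by a multiple of 8 bits is a
pure byte permutation; a C model of what the ROT16 constant below makes
pshufb compute in a single instruction (ROT8 is analogous):

    #include <stdint.h>
    #include <string.h>

    /* Byte permutation equal to rotl32(w, 16) on each little-endian
     * 32-bit word of a 16-byte vector. */
    static void pshufb_rot16(uint8_t v[16])
    {
            static const uint8_t mask[16] = {
                    2, 3, 0, 1, 6, 7, 4, 5,
                    10, 11, 8, 9, 14, 15, 12, 13
            };
            uint8_t t[16];
            int i;

            for (i = 0; i < 16; i++)
                    t[i] = v[mask[i]];
            memcpy(v, t, 16);
    }

Rotates by 7 and 12 bits cross byte boundaries, which is why those
still use the traditional shift+OR sequence.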

For large messages, throughput increases by ~65% compared to
chacha20-generic:

testing speed of chacha20 (chacha20-generic) encryption
test 0 (256 bit key, 16 byte blocks): 45089207 operations in 10 seconds 
(721427312 bytes)
test 1 (256 bit key, 64 byte blocks): 43839521 operations in 10 seconds 
(2805729344 bytes)
test 2 (256 bit key, 256 byte blocks): 12702056 operations in 10 seconds 
(3251726336 bytes)
test 3 (256 bit key, 1024 byte blocks): 3371173 operations in 10 seconds 
(3452081152 bytes)
test 4 (256 bit key, 8192 byte blocks): 422468 operations in 10 seconds 
(3460857856 bytes)

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 43141886 operations in 10 seconds 
(690270176 bytes)
test 1 (256 bit key, 64 byte blocks): 46845874 operations in 10 seconds 
(2998135936 bytes)
test 2 (256 bit key, 256 byte blocks): 18458512 operations in 10 seconds 
(4725379072 bytes)
test 3 (256 bit key, 1024 byte blocks): 5360533 operations in 10 seconds 
(5489185792 bytes)
test 4 (256 bit key, 8192 byte blocks): 692846 operations in 10 seconds 
(5675794432 bytes)

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/Makefile|   2 +
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 142 
 arch/x86/crypto/chacha20_glue.c | 123 +++
 crypto/Kconfig  |  15 
 4 files changed, 282 insertions(+)
 create mode 100644 arch/x86/crypto/chacha20-ssse3-x86_64.S
 create mode 100644 arch/x86/crypto/chacha20_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5a4a089..b09e9a4 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -60,6 +61,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
+chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
 ifeq ($(avx_supported),yes)
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S 
b/arch/x86/crypto/chacha20-ssse3-x86_64.S
new file mode 100644
index 000..1b97ad0
--- /dev/null
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -0,0 +1,142 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 16
+
+ROT8:  .octa 0x0e0d0c0f0a09080b0605040702010003
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+
+.text
+
+ENTRY(chacha20_block_xor_ssse3)
+   # %rdi: Input state matrix, s
+   # %rsi: 1 data block output, o
+   # %rdx: 1 data block input, i
+
+   # This function encrypts one ChaCha20 block by loading the state matrix
+   # in four SSE registers. It performs matrix operation on four words in
+   # parallel, but requireds shuffling to rearrange the words after each
+   # round. 8/16-bit word rotation is done with the slightly better
+   # performing SSSE3 byte shuffling, 7/12-bit word rotation uses
+   # traditional shift+OR.
+
+   # x0..3 = s0..3
+   movdqa  0x00(%rdi),%xmm0
+   movdqa  0x10(%rdi),%xmm1
+   movdqa  0x20(%rdi),%xmm2
+   movdqa  0x30(%rdi),%xmm3
+   movdqa  %xmm0,%xmm8
+   movdqa  %xmm1,%xmm9
+   movdqa  %xmm2,%xmm10
+   movdqa  %xmm3,%xmm11
+
+   movdqa  ROT8(%rip),%xmm4
+   movdqa  ROT16(%rip),%xmm5
+
+   mov $10,%ecx
+
+.Ldoubleround:
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   paddd   %xmm1,%xmm0
+   pxor    %xmm0,%xmm3
+   pshufb  %xmm5,%xmm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   paddd   %xmm3,%xmm2
+   pxor

[PATCH v2 01/10] crypto: tcrypt - Add ChaCha20/Poly1305 speed tests

2015-07-16 Thread Martin Willi
Adds individual ChaCha20 and Poly1305 and a combined rfc7539esp AEAD speed
test using mode numbers 214, 321 and 213. For Poly1305 we add a specific
speed template, as it expects the key prepended to the input data.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/tcrypt.c | 15 +++
 crypto/tcrypt.h | 20 
 2 files changed, 35 insertions(+)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 73ed4f2..e9a05ba 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1793,6 +1793,17 @@ static int do_test(const char *alg, u32 type, u32 mask, 
int m)
NULL, 0, 16, 16, aead_speed_template_19);
break;
 
+   case 213:
+   test_aead_speed("rfc7539esp(chacha20,poly1305)", ENCRYPT, sec,
+   NULL, 0, 16, 8, aead_speed_template_36);
+   break;
+
+   case 214:
+   test_cipher_speed("chacha20", ENCRYPT, sec, NULL, 0,
+ speed_template_32);
+   break;
+
+
case 300:
if (alg) {
test_hash_speed(alg, sec, generic_hash_speed_template);
@@ -1881,6 +1892,10 @@ static int do_test(const char *alg, u32 type, u32 mask, 
int m)
test_hash_speed("crct10dif", sec, generic_hash_speed_template);
if (mode > 300 && mode < 400) break;
 
+   case 321:
+   test_hash_speed("poly1305", sec, poly1305_speed_template);
+   if (mode > 300 && mode < 400) break;
+
case 399:
break;
 
diff --git a/crypto/tcrypt.h b/crypto/tcrypt.h
index 6cc1b85..f0bfee1 100644
--- a/crypto/tcrypt.h
+++ b/crypto/tcrypt.h
@@ -61,12 +61,14 @@ static u8 speed_template_32_40_48[] = {32, 40, 48, 0};
 static u8 speed_template_32_48[] = {32, 48, 0};
 static u8 speed_template_32_48_64[] = {32, 48, 64, 0};
 static u8 speed_template_32_64[] = {32, 64, 0};
+static u8 speed_template_32[] = {32, 0};
 
 /*
  * AEAD speed tests
  */
 static u8 aead_speed_template_19[] = {19, 0};
 static u8 aead_speed_template_20[] = {20, 0};
+static u8 aead_speed_template_36[] = {36, 0};
 
 /*
  * Digest speed tests
@@ -127,4 +129,22 @@ static struct hash_speed hash_speed_template_16[] = {
{  .blen = 0,   .plen = 0,  .klen = 0, }
 };
 
+static struct hash_speed poly1305_speed_template[] = {
+   { .blen = 96,   .plen = 16, },
+   { .blen = 96,   .plen = 32, },
+   { .blen = 96,   .plen = 96, },
+   { .blen = 288,  .plen = 16, },
+   { .blen = 288,  .plen = 32, },
+   { .blen = 288,  .plen = 288, },
+   { .blen = 1056, .plen = 32, },
+   { .blen = 1056, .plen = 1056, },
+   { .blen = 2080, .plen = 32, },
+   { .blen = 2080, .plen = 2080, },
+   { .blen = 4128, .plen = 4128, },
+   { .blen = 8224, .plen = 8224, },
+
+   /* End marker */
+   {  .blen = 0,   .plen = 0, }
+};
+
 #endif /* _CRYPTO_TCRYPT_H */
-- 
1.9.1



[PATCH v2 06/10] crypto: testmgr - Add a longer ChaCha20 test vector

2015-07-16 Thread Martin Willi
The AVX2 variant of ChaCha20 is used only for messages with >= 512 bytes
length. With the existing test vectors, the implementation could not be
tested. Due to the lack of such a long official test vector, this one is
self-generated using chacha20-generic.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/testmgr.h | 334 ++-
 1 file changed, 333 insertions(+), 1 deletion(-)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 754e47f..9ca350c 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -30216,7 +30216,7 @@ static struct cipher_testvec 
salsa20_stream_enc_tv_template[] = {
},
 };
 
-#define CHACHA20_ENC_TEST_VECTORS 3
+#define CHACHA20_ENC_TEST_VECTORS 4
 static struct cipher_testvec chacha20_enc_tv_template[] = {
{ /* RFC7539 A.2. Test Vector #1 */
.key= \x00\x00\x00\x00\x00\x00\x00\x00
@@ -30390,6 +30390,338 @@ static struct cipher_testvec 
chacha20_enc_tv_template[] = {
  \x87\xb5\x8d\xfd\x72\x8a\xfa\x36
  \x75\x7a\x79\x7a\xc1\x88\xd1,
.rlen   = 127,
+   }, { /* Self-made test vector for long data */
+   .key= \x1c\x92\x40\xa5\xeb\x55\xd3\x8a
+ \xf3\x33\x88\x86\x04\xf6\xb5\xf0
+ \x47\x39\x17\xc1\x40\x2b\x80\x09
+ \x9d\xca\x5c\xbc\x20\x70\x75\xc0,
+   .klen   = 32,
+   .iv = \x1c\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x01,
+   .input  = \x49\xee\xe0\xdc\x24\x90\x40\xcd
+ \xc5\x40\x8f\x47\x05\xbc\xdd\x81
+ \x47\xc6\x8d\xe6\xb1\x8f\xd7\xcb
+ \x09\x0e\x6e\x22\x48\x1f\xbf\xb8
+ \x5c\xf7\x1e\x8a\xc1\x23\xf2\xd4
+ \x19\x4b\x01\x0f\x4e\xa4\x43\xce
+ \x01\xc6\x67\xda\x03\x91\x18\x90
+ \xa5\xa4\x8e\x45\x03\xb3\x2d\xac
+ \x74\x92\xd3\x53\x47\xc8\xdd\x25
+ \x53\x6c\x02\x03\x87\x0d\x11\x0c
+ \x58\xe3\x12\x18\xfd\x2a\x5b\x40
+ \x0c\x30\xf0\xb8\x3f\x43\xce\xae
+ \x65\x3a\x7d\x7c\xf4\x54\xaa\xcc
+ \x33\x97\xc3\x77\xba\xc5\x70\xde
+ \xd7\xd5\x13\xa5\x65\xc4\x5f\x0f
+ \x46\x1a\x0d\x97\xb5\xf3\xbb\x3c
+ \x84\x0f\x2b\xc5\xaa\xea\xf2\x6c
+ \xc9\xb5\x0c\xee\x15\xf3\x7d\xbe
+ \x9f\x7b\x5a\xa6\xae\x4f\x83\xb6
+ \x79\x49\x41\xf4\x58\x18\xcb\x86
+ \x7f\x30\x0e\xf8\x7d\x44\x36\xea
+ \x75\xeb\x88\x84\x40\x3c\xad\x4f
+ \x6f\x31\x6b\xaa\x5d\xe5\xa5\xc5
+ \x21\x66\xe9\xa7\xe3\xb2\x15\x88
+ \x78\xf6\x79\xa1\x59\x47\x12\x4e
+ \x9f\x9f\x64\x1a\xa0\x22\x5b\x08
+ \xbe\x7c\x36\xc2\x2b\x66\x33\x1b
+ \xdd\x60\x71\xf7\x47\x8c\x61\xc3
+ \xda\x8a\x78\x1e\x16\xfa\x1e\x86
+ \x81\xa6\x17\x2a\xa7\xb5\xc2\xe7
+ \xa4\xc7\x42\xf1\xcf\x6a\xca\xb4
+ \x45\xcf\xf3\x93\xf0\xe7\xea\xf6
+ \xf4\xe6\x33\x43\x84\x93\xa5\x67
+ \x9b\x16\x58\x58\x80\x0f\x2b\x5c
+ \x24\x74\x75\x7f\x95\x81\xb7\x30
+ \x7a\x33\xa7\xf7\x94\x87\x32\x27
+ \x10\x5d\x14\x4c\x43\x29\xdd\x26
+ \xbd\x3e\x3c\x0e\xfe\x0e\xa5\x10
+ \xea\x6b\x64\xfd\x73\xc6\xed\xec
+ \xa8\xc9\xbf\xb3\xba\x0b\x4d\x07
+ \x70\xfc\x16\xfd\x79\x1e\xd7\xc5
+ \x49\x4e\x1c\x8b\x8d\x79\x1b\xb1
+ \xec\xca\x60\x09\x4c\x6a\xd5\x09
+ \x49\x46\x00\x88\x22\x8d\xce\xea
+ \xb1\x17\x11\xde\x42\xd2\x23\xc1
+ \x72\x11\xf5\x50\x73\x04\x40\x47
+ \xf9\x5d\xe7\xa7\x26\xb1\x7e\xb0
+ \x3f\x58\xc1\x52\xab\x12\x67\x9d
+ \x3f\x43\x4b\x68\xd4\x9c\x68\x38
+ \x07\x8a\x2d\x3e\xf3\xaf\x6a\x4b
+ \xf9\xe5\x31\x69\x22\xf9\xa6\x69
+ \xc6\x9c\x96\x9a\x12\x35\x95\x1d
+ \x95\xd5\xdd\xbe\xbf\x93\x53\x24
+ \xfd\xeb\xc2\x0a\x64\xb0\x77\x00
+ \x6f\x88\xc4\x37\x18\x69\x7c\xd7
+ \x41\x92\x55\x4c\x03\xa1\x9a\x4b
+ \x15\xe5\xdf\x7f\x37\x33\x72\xc1
+ \x8b\x10\x67\xa3\x01\x57

[PATCH v2 07/10] crypto: poly1305 - Export common Poly1305 helpers

2015-07-16 Thread Martin Willi
As architecture specific drivers need a software fallback, export Poly1305
init/update/final functions together with some helpers in a header file.
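
The intended usage is an arch driver that calls back into these helpers
whenever its SIMD path is unavailable; a sketch of an update hook (the
actual x86 glue comes later in this series, irq_fpu_usable() shown as
the typical gate):

    /* Fall back to the exported generic code when the FPU cannot be
     * used in the current context. */
    static int poly1305_simd_update(struct shash_desc *desc,
                                    const u8 *src, unsigned int srclen)
    {
            if (!irq_fpu_usable())
                    return crypto_poly1305_update(desc, src, srclen);

            /* ... accelerated SIMD path ... */
            return 0;
    }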

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/chacha20poly1305.c |  4 +--
 crypto/poly1305_generic.c | 73 +++
 include/crypto/poly1305.h | 41 ++
 3 files changed, 77 insertions(+), 41 deletions(-)
 create mode 100644 include/crypto/poly1305.h

diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index 410554d..b71445f 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -14,6 +14,7 @@
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <crypto/chacha20.h>
+#include <crypto/poly1305.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -21,9 +22,6 @@
 
 #include "internal.h"
 
-#define POLY1305_BLOCK_SIZE 16
-#define POLY1305_DIGEST_SIZE   16
-#define POLY1305_KEY_SIZE  32
 #define CHACHAPOLY_IV_SIZE 12
 
 struct chachapoly_instance_ctx {
diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
index 387b5c8..2df9835d 100644
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -13,31 +13,11 @@
 
 #include <crypto/algapi.h>
 #include <crypto/internal/hash.h>
+#include <crypto/poly1305.h>
 #include <linux/crypto.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 
-#define POLY1305_BLOCK_SIZE 16
-#define POLY1305_KEY_SIZE  32
-#define POLY1305_DIGEST_SIZE   16
-
-struct poly1305_desc_ctx {
-   /* key */
-   u32 r[5];
-   /* finalize key */
-   u32 s[4];
-   /* accumulator */
-   u32 h[5];
-   /* partial buffer */
-   u8 buf[POLY1305_BLOCK_SIZE];
-   /* bytes used in partial buffer */
-   unsigned int buflen;
-   /* r key has been set */
-   bool rset;
-   /* s key has been set */
-   bool sset;
-};
-
 static inline u64 mlt(u64 a, u64 b)
 {
return a * b;
@@ -58,7 +38,7 @@ static inline u32 le32_to_cpuvp(const void *p)
return le32_to_cpup(p);
 }
 
-static int poly1305_init(struct shash_desc *desc)
+int crypto_poly1305_init(struct shash_desc *desc)
 {
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
@@ -69,8 +49,9 @@ static int poly1305_init(struct shash_desc *desc)
 
return 0;
 }
+EXPORT_SYMBOL_GPL(crypto_poly1305_init);
 
-static int poly1305_setkey(struct crypto_shash *tfm,
+int crypto_poly1305_setkey(struct crypto_shash *tfm,
   const u8 *key, unsigned int keylen)
 {
/* Poly1305 requires a unique key for each tag, which implies that
@@ -79,6 +60,7 @@ static int poly1305_setkey(struct crypto_shash *tfm,
 * the update() call. */
return -ENOTSUPP;
 }
+EXPORT_SYMBOL_GPL(crypto_poly1305_setkey);
 
 static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
 {
@@ -98,16 +80,10 @@ static void poly1305_setskey(struct poly1305_desc_ctx 
*dctx, const u8 *key)
dctx->s[3] = le32_to_cpuvp(key + 12);
 }
 
-static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
-   const u8 *src, unsigned int srclen,
-   u32 hibit)
+unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+   const u8 *src, unsigned int srclen)
 {
-   u32 r0, r1, r2, r3, r4;
-   u32 s1, s2, s3, s4;
-   u32 h0, h1, h2, h3, h4;
-   u64 d0, d1, d2, d3, d4;
-
-   if (unlikely(!dctx->sset)) {
+   if (!dctx->sset) {
if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
poly1305_setrkey(dctx, src);
src += POLY1305_BLOCK_SIZE;
@@ -121,6 +97,25 @@ static unsigned int poly1305_blocks(struct 
poly1305_desc_ctx *dctx,
dctx->sset = true;
}
}
+   return srclen;
+}
+EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
+
+static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
+   const u8 *src, unsigned int srclen,
+   u32 hibit)
+{
+   u32 r0, r1, r2, r3, r4;
+   u32 s1, s2, s3, s4;
+   u32 h0, h1, h2, h3, h4;
+   u64 d0, d1, d2, d3, d4;
+   unsigned int datalen;
+
+   if (unlikely(!dctx->sset)) {
+   datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+   src += srclen - datalen;
+   srclen = datalen;
+   }
 
r0 = dctx->r[0];
r1 = dctx->r[1];
@@ -181,7 +176,7 @@ static unsigned int poly1305_blocks(struct 
poly1305_desc_ctx *dctx,
return srclen;
 }
 
-static int poly1305_update(struct shash_desc *desc,
+int crypto_poly1305_update(struct shash_desc *desc,
   const u8 *src, unsigned int srclen)
 {
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
@@ -214,8 +209,9 @@ static int poly1305_update(struct shash_desc *desc

[PATCH v2 10/10] crypto: poly1305 - Add a four block AVX2 variant for x86_64

2015-07-16 Thread Martin Willi
Extends the x86_64 Poly1305 authenticator by a function processing four
consecutive Poly1305 blocks in parallel using AVX2 instructions.
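
The parallel form follows from unrolling h = (h + m) * r four times
(sketching the algebra only; the code tracks it in 26-bit limbs):

  h = ((((h + m1)*r + m2)*r + m3)*r + m4)*r
    = (h + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r

The four products on the right are independent, so with r^4, r^3, r^2
and r precomputed they map onto the four 64-bit multiply lanes of
vpmuludq.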

For large messages, throughput increases by ~15-45% compared to two
block SSE2:

testing speed of poly1305 (poly1305-simd)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3809514 
opers/sec,  365713411 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 5973423 
opers/sec,  573448627 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9446779 
opers/sec,  906890803 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1364814 
opers/sec,  393066691 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2045780 
opers/sec,  589184697 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3711946 
opers/sec, 1069040592 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  573686 
opers/sec,  605812732 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1647802 
opers/sec, 1740079440 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  292970 
opers/sec,  609378224 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates):  943229 
opers/sec, 1961916528 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  494623 
opers/sec, 2041804569 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  254045 
opers/sec, 2089271014 bytes/sec

testing speed of poly1305 (poly1305-simd)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3826224 
opers/sec,  367317552 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 5948638 
opers/sec,  571069267 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9439110 
opers/sec,  906154627 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1367756 
opers/sec,  393913872 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2056881 
opers/sec,  592381958 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3711153 
opers/sec, 1068812179 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  574940 
opers/sec,  607136745 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1948830 
opers/sec, 2057964585 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  293308 
opers/sec,  610082096 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates): 1235224 
opers/sec, 2569267792 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  684405 
opers/sec, 2825226316 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  367101 
opers/sec, 3019039446 bytes/sec

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/Makefile   |   1 +
 arch/x86/crypto/poly1305-avx2-x86_64.S | 386 +
 arch/x86/crypto/poly1305_glue.c|  40 
 crypto/Kconfig |   2 +-
 4 files changed, 428 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5cf405c..9a2838c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -89,6 +89,7 @@ sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
 ifeq ($(avx2_supported),yes)
 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
+poly1305-x86_64-y += poly1305-avx2-x86_64.o
 endif
 crc32c-intel-y := crc32c-intel_glue.o
 crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S 
b/arch/x86/crypto/poly1305-avx2-x86_64.S
new file mode 100644
index 000..eff2f41
--- /dev/null
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -0,0 +1,386 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 32
+
+ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
+	.octa 0x0000000003ffffff0000000003ffffff
+ORMASK:	.octa 0x00000000010000000000000001000000
+	.octa 0x00000000010000000000000001000000
+
+.text
+
+#define h0 0x00(%rdi)
+#define h1 0x04(%rdi)
+#define h2 0x08(%rdi)
+#define h3 0x0c(%rdi)
+#define h4 0x10(%rdi)
+#define r0 0x00(%rdx)
+#define r1 0x04(%rdx)
+#define r2 0x08(%rdx)
+#define r3 0x0c(%rdx)
+#define r4 0x10(%rdx)
+#define u0 0x00(%r8)
+#define u1 0x04(%r8)
+#define u2 0x08(%r8)
+#define u3 0x0c(%r8)
+#define u4 0x10(%r8)

[PATCH v2 05/10] crypto: chacha20 - Add an eight block AVX2 variant for x86_64

2015-07-16 Thread Martin Willi
Extends the x86_64 ChaCha20 implementation by a function processing eight
ChaCha20 blocks in parallel using AVX2.
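
The glue change is small; a condensed sketch of the dispatch loop this
patch adds to chacha20_dosimd() (the real function keeps the existing
4- and 1-block SSSE3 paths for the remainder; the sketch name is just
for illustration):

#include <crypto/chacha20.h>

asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);

static void chacha20_dosimd_sketch(u32 *state, u8 *dst, const u8 *src,
				   unsigned int bytes)
{
	while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
		chacha20_8block_xor_avx2(state, dst, src);
		bytes -= CHACHA20_BLOCK_SIZE * 8;
		src += CHACHA20_BLOCK_SIZE * 8;
		dst += CHACHA20_BLOCK_SIZE * 8;
		state[12] += 8;		/* block counter advances by 8 */
	}
	/* remaining bytes fall through to the 4-/1-block paths */
}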

For large messages, throughput increases by ~55-70% compared to four block
SSSE3:

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 42249230 operations in 10 seconds 
(675987680 bytes)
test 1 (256 bit key, 64 byte blocks): 46441641 operations in 10 seconds 
(2972265024 bytes)
test 2 (256 bit key, 256 byte blocks): 33028112 operations in 10 seconds 
(8455196672 bytes)
test 3 (256 bit key, 1024 byte blocks): 11568759 operations in 10 seconds 
(11846409216 bytes)
test 4 (256 bit key, 8192 byte blocks): 1448761 operations in 10 seconds 
(11868250112 bytes)

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 41999675 operations in 10 seconds 
(671994800 bytes)
test 1 (256 bit key, 64 byte blocks): 45805908 operations in 10 seconds 
(2931578112 bytes)
test 2 (256 bit key, 256 byte blocks): 32814947 operations in 10 seconds 
(8400626432 bytes)
test 3 (256 bit key, 1024 byte blocks): 19777167 operations in 10 seconds 
(20251819008 bytes)
test 4 (256 bit key, 8192 byte blocks): 2279321 operations in 10 seconds 
(18672197632 bytes)

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/Makefile   |   1 +
 arch/x86/crypto/chacha20-avx2-x86_64.S | 443 +
 arch/x86/crypto/chacha20_glue.c|  19 ++
 crypto/Kconfig |   2 +-
 4 files changed, 464 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/crypto/chacha20-avx2-x86_64.S

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index b09e9a4..ce39b3c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -77,6 +77,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o 
camellia_aesni_avx2_glue.o
+   chacha20-x86_64-y += chacha20-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 endif
 
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S 
b/arch/x86/crypto/chacha20-avx2-x86_64.S
new file mode 100644
index 000..16694e6
--- /dev/null
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -0,0 +1,443 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 32
+
+ROT8:  .octa 0x0e0d0c0f0a09080b0605040702010003
+   .octa 0x0e0d0c0f0a09080b0605040702010003
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+   .octa 0x0d0c0f0e09080b0a0504070601000302
+CTRINC:	.octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha20_8block_xor_avx2)
+   # %rdi: Input state matrix, s
+   # %rsi: 8 data blocks output, o
+   # %rdx: 8 data blocks input, i
+
+   # This function encrypts eight consecutive ChaCha20 blocks by loading
+   # the state matrix in AVX registers eight times. As we need some
+   # scratch registers, we save the first four registers on the stack. The
+   # algorithm performs each operation on the corresponding word of each
+   # state matrix, hence requires no word shuffling. For final XORing step
+   # we transpose the matrix by interleaving 32-, 64- and then 128-bit
+   # words, which allows us to do XOR in AVX registers. 8/16-bit word
+   # rotation is done with the slightly better performing byte shuffling,
+   # 7/12-bit word rotation uses traditional shift+OR.
+
+   vzeroupper
+   # 4 * 32 byte stack, 32-byte aligned
+   mov %rsp, %r8
+   and $~31, %rsp
+   sub $0x80, %rsp
+
+   # x0..15[0-7] = s[0..15]
+   vpbroadcastd 0x00(%rdi),%ymm0
+   vpbroadcastd 0x04(%rdi),%ymm1
+   vpbroadcastd 0x08(%rdi),%ymm2
+   vpbroadcastd 0x0c(%rdi),%ymm3
+   vpbroadcastd 0x10(%rdi),%ymm4
+   vpbroadcastd 0x14(%rdi),%ymm5
+   vpbroadcastd 0x18(%rdi),%ymm6
+   vpbroadcastd 0x1c(%rdi),%ymm7
+   vpbroadcastd 0x20(%rdi),%ymm8
+   vpbroadcastd 0x24(%rdi),%ymm9
+   vpbroadcastd 0x28(%rdi),%ymm10
+   vpbroadcastd 0x2c(%rdi),%ymm11
+   vpbroadcastd 0x30(%rdi),%ymm12
+   vpbroadcastd 0x34(%rdi),%ymm13
+   vpbroadcastd 0x38(%rdi),%ymm14
+   vpbroadcastd 0x3c(%rdi),%ymm15
+   # x0..3 on stack
+   vmovdqa %ymm0,0x00(%rsp)
+   vmovdqa %ymm1,0x20(%rsp)
+   vmovdqa %ymm2,0x40(%rsp)
+   vmovdqa %ymm3,0x60(%rsp)
+
+   vmovdqa CTRINC(%rip),%ymm1

[PATCH v2 00/10] crypto: x86_64 - Add SSE/AVX2 ChaCha20/Poly1305 ciphers

2015-07-16 Thread Martin Willi
This patch series adds both ChaCha20 and Poly1305 specific ciphers for
x86_64 using SSE2/SSSE3 and AVX2 instructions. The idea is to have a drop-in
replacement for AESNI/CLMUL-accelerated AES-GCM providing at least somewhat
comparable performance, refer to RFC7539 for details. It is based on cryptodev,
including the ChaCha20/Poly1305 AEAD interface conversion patch.

The first patch adds some speed tests to tcrypt. The second patch exports
some functionality from chacha20-generic to use it as fallback. Patch 3
adds a single block SSSE3 driver for ChaCha20, while patch 4 and 5 extend it
by an optimized four block SSSE3 and an eight block AVX2 variant. Patch 6
adds an additional test vector for ChaCha20 to actually test the AVX2 eight
block variant processing 512-bytes at once.

Patch 7 exports some poly1305-generic functionality to use it as fallback.
Patch 8 introduces a single block SSE2 driver for Poly1305, while patch 9
and 10 add an optimized two block SSE2 and a four block AVX2 variant.

Overall speedup for the ChaCha20/Poly1305 AEAD for typical IPsec payloads
is ~50-150% with SSE2/SSSE3 and ~100-200% with AVX2, or even more for larger
payloads:

generic:
testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-generic,poly1305-generic)) encryption
test 0 (288 bit key, 16 byte blocks): 10456041 operations in 10 seconds 
(167296656 bytes)
test 1 (288 bit key, 64 byte blocks): 9999411 operations in 10 seconds 
(639962304 bytes)
test 2 (288 bit key, 256 byte blocks): 5793012 operations in 10 seconds 
(1483011072 bytes)
test 3 (288 bit key, 512 byte blocks): 3743676 operations in 10 seconds 
(1916762112 bytes)
test 4 (288 bit key, 1024 byte blocks): 2190023 operations in 10 seconds 
(2242583552 bytes)
test 5 (288 bit key, 2048 byte blocks): 1195864 operations in 10 seconds 
(2449129472 bytes)
test 6 (288 bit key, 4096 byte blocks): 627625 operations in 10 seconds 
(2570752000 bytes)
test 7 (288 bit key, 8192 byte blocks): 319844 operations in 10 seconds 
(2620162048 bytes)

SSE2/SSSE3:
testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-simd,poly1305-simd)) encryption
test 0 (288 bit key, 16 byte blocks): 10077910 operations in 10 seconds 
(161246560 bytes)
test 1 (288 bit key, 64 byte blocks): 9990400 operations in 10 seconds 
(639385600 bytes)
test 2 (288 bit key, 256 byte blocks): 7953774 operations in 10 seconds 
(2036166144 bytes)
test 3 (288 bit key, 512 byte blocks): 6351059 operations in 10 seconds 
(3251742208 bytes)
test 4 (288 bit key, 1024 byte blocks): 4593059 operations in 10 seconds 
(4703292416 bytes)
test 5 (288 bit key, 2048 byte blocks): 2956300 operations in 10 seconds 
(6054502400 bytes)
test 6 (288 bit key, 4096 byte blocks): 1724958 operations in 10 seconds 
(7065427968 bytes)
test 7 (288 bit key, 8192 byte blocks): 925156 operations in 10 seconds 
(7578877952 bytes)

AVX2:
testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-simd,poly1305-simd)) encryption
test 0 (288 bit key, 16 byte blocks): 10006774 operations in 10 seconds 
(160108384 bytes)
test 1 (288 bit key, 64 byte blocks): 9896498 operations in 10 seconds 
(633375872 bytes)
test 2 (288 bit key, 256 byte blocks): 7922198 operations in 10 seconds 
(2028082688 bytes)
test 3 (288 bit key, 512 byte blocks): 7261666 operations in 10 seconds 
(3717972992 bytes)
test 4 (288 bit key, 1024 byte blocks): 5835006 operations in 10 seconds 
(5975046144 bytes)
test 5 (288 bit key, 2048 byte blocks): 4172937 operations in 10 seconds 
(8546174976 bytes)
test 6 (288 bit key, 4096 byte blocks): 2670484 operations in 10 seconds 
(10938302464 bytes)
test 7 (288 bit key, 8192 byte blocks): 1504684 operations in 10 seconds 
(12326371328 bytes)

All benchmark results from a Core i5-4670T.

The ChaCha20/Poly1305 AEAD on Haswell with AVX2 has about half the raw
AESNI/CLMUL-accelerated AES-GCM (rfc4106-gcm-aesni) performance for typical
IPsec MTUs. On Ivy Bridge using SSE2/SSSE3 the numbers compared to AES-GCM
are very similar due to the less efficient CLMUL instructions.

Changes in v2:
- No code changes
- Use sec=10 for more reliable benchmark results

Martin Willi (10):
  crypto: tcrypt - Add ChaCha20/Poly1305 speed tests
  crypto: chacha20 - Export common ChaCha20 helpers
  crypto: chacha20 - Add a SSSE3 SIMD variant for x86_64
  crypto: chacha20 - Add a four block SSSE3 variant for x86_64
  crypto: chacha20 - Add an eight block AVX2 variant for x86_64
  crypto: testmgr - Add a longer ChaCha20 test vector
  crypto: poly1305 - Export common Poly1305 helpers
  crypto: poly1305 - Add a SSE2 SIMD variant for x86_64
  crypto: poly1305 - Add a two block SSE2 variant for x86_64
  crypto: poly1305 - Add a four block AVX2 variant for x86_64

 arch/x86/crypto/Makefile|   6 +
 arch/x86/crypto/chacha20-avx2-x86_64.S  | 443 ++
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 625 
 arch/x86/crypto/chacha20_glue.c | 150 
 arch/x86/crypto

[PATCH v2 04/10] crypto: chacha20 - Add a four block SSSE3 variant for x86_64

2015-07-16 Thread Martin Willi
Extends the x86_64 SSSE3 ChaCha20 implementation by a function processing
four ChaCha20 blocks in parallel. This avoids the word shuffling needed
in the single block variant, further increasing throughput.
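
The layout is easiest to see in scalar form. A toy C sketch (not the
driver itself) where each array plays the role of one SSE register,
holding the same state word from four independent blocks:

#include <stdint.h>
#include <stdio.h>

static uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

int main(void)
{
	uint32_t x0[4]  = {1, 2, 3, 4};		/* word 0 of blocks 0..3  */
	uint32_t x4[4]  = {5, 6, 7, 8};		/* word 4 of blocks 0..3  */
	uint32_t x12[4] = {9, 10, 11, 12};	/* word 12 of blocks 0..3 */

	/* one quarter-round step, x0 += x4; x12 = rotl(x12 ^ x0, 16):
	 * the same operation in every lane, no shuffling needed */
	for (int i = 0; i < 4; i++) {
		x0[i] += x4[i];
		x12[i] = rotl32(x12[i] ^ x0[i], 16);
	}

	for (int i = 0; i < 4; i++)
		printf("block %d: x0=%u x12=%u\n", i, x0[i], x12[i]);
	return 0;
}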

For large messages, throughput increases by ~110% compared to single block
SSSE3:

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 43141886 operations in 10 seconds 
(690270176 bytes)
test 1 (256 bit key, 64 byte blocks): 46845874 operations in 10 seconds 
(2998135936 bytes)
test 2 (256 bit key, 256 byte blocks): 18458512 operations in 10 seconds 
(4725379072 bytes)
test 3 (256 bit key, 1024 byte blocks): 5360533 operations in 10 seconds 
(5489185792 bytes)
test 4 (256 bit key, 8192 byte blocks): 692846 operations in 10 seconds 
(5675794432 bytes)

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 42249230 operations in 10 seconds 
(675987680 bytes)
test 1 (256 bit key, 64 byte blocks): 46441641 operations in 10 seconds 
(2972265024 bytes)
test 2 (256 bit key, 256 byte blocks): 33028112 operations in 10 seconds 
(8455196672 bytes)
test 3 (256 bit key, 1024 byte blocks): 11568759 operations in 10 seconds 
(11846409216 bytes)
test 4 (256 bit key, 8192 byte blocks): 1448761 operations in 10 seconds 
(11868250112 bytes)

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 483 
 arch/x86/crypto/chacha20_glue.c |   8 +
 2 files changed, 491 insertions(+)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S 
b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 1b97ad0..712b130 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -16,6 +16,7 @@
 
 ROT8:  .octa 0x0e0d0c0f0a09080b0605040702010003
 ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+CTRINC:	.octa 0x00000003000000020000000100000000
 
 .text
 
@@ -140,3 +141,485 @@ ENTRY(chacha20_block_xor_ssse3)
 
ret
 ENDPROC(chacha20_block_xor_ssse3)
+
+ENTRY(chacha20_4block_xor_ssse3)
+   # %rdi: Input state matrix, s
+   # %rsi: 4 data blocks output, o
+   # %rdx: 4 data blocks input, i
+
+   # This function encrypts four consecutive ChaCha20 blocks by loading
+   # the state matrix in SSE registers four times. As we need some scratch
+   # registers, we save the first four registers on the stack. The
+   # algorithm performs each operation on the corresponding word of each
+   # state matrix, hence requires no word shuffling. For final XORing step
+   # we transpose the matrix by interleaving 32- and then 64-bit words,
+   # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
+   # done with the slightly better performing SSSE3 byte shuffling,
+   # 7/12-bit word rotation uses traditional shift+OR.
+
+   sub $0x40,%rsp
+
+   # x0..15[0-3] = s0..3[0..3]
+   movq 0x00(%rdi),%xmm1
+   pshufd  $0x00,%xmm1,%xmm0
+   pshufd  $0x55,%xmm1,%xmm1
+   movq 0x08(%rdi),%xmm3
+   pshufd  $0x00,%xmm3,%xmm2
+   pshufd  $0x55,%xmm3,%xmm3
+   movq 0x10(%rdi),%xmm5
+   pshufd  $0x00,%xmm5,%xmm4
+   pshufd  $0x55,%xmm5,%xmm5
+   movq 0x18(%rdi),%xmm7
+   pshufd  $0x00,%xmm7,%xmm6
+   pshufd  $0x55,%xmm7,%xmm7
+   movq 0x20(%rdi),%xmm9
+   pshufd  $0x00,%xmm9,%xmm8
+   pshufd  $0x55,%xmm9,%xmm9
+   movq 0x28(%rdi),%xmm11
+   pshufd  $0x00,%xmm11,%xmm10
+   pshufd  $0x55,%xmm11,%xmm11
+   movq 0x30(%rdi),%xmm13
+   pshufd  $0x00,%xmm13,%xmm12
+   pshufd  $0x55,%xmm13,%xmm13
+   movq 0x38(%rdi),%xmm15
+   pshufd  $0x00,%xmm15,%xmm14
+   pshufd  $0x55,%xmm15,%xmm15
+   # x0..3 on stack
+   movdqa  %xmm0,0x00(%rsp)
+   movdqa  %xmm1,0x10(%rsp)
+   movdqa  %xmm2,0x20(%rsp)
+   movdqa  %xmm3,0x30(%rsp)
+
+   movdqa  CTRINC(%rip),%xmm1
+   movdqa  ROT8(%rip),%xmm2
+   movdqa  ROT16(%rip),%xmm3
+
+   # x12 += counter values 0-3
+   paddd   %xmm1,%xmm12
+
+   mov $10,%ecx
+
+.Ldoubleround4:
+   # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+   movdqa  0x00(%rsp),%xmm0
+   paddd   %xmm4,%xmm0
+   movdqa  %xmm0,0x00(%rsp)
+   pxor %xmm0,%xmm12
+   pshufb  %xmm3,%xmm12
+   # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+   movdqa  0x10(%rsp),%xmm0
+   paddd   %xmm5,%xmm0
+   movdqa  %xmm0,0x10(%rsp)
+   pxor %xmm0,%xmm13
+   pshufb  %xmm3,%xmm13
+   # x2 += x6, x14 = rotl32(x14 ^ x2, 16)

[PATCH v2 09/10] crypto: poly1305 - Add a two block SSE2 variant for x86_64

2015-07-16 Thread Martin Willi
Extends the x86_64 SSE2 Poly1305 authenticator by a function processing two
consecutive Poly1305 blocks in parallel using a derived key r^2. Loop
unrolling can be more effectively mapped to SSE instructions, further
increasing throughput.
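
The unrolling relies on the identity
((h + m1)*r + m2)*r == (h + m1)*r^2 + m2*r, which can be sanity-checked
with a small stand-in modulus (toy code, not Poly1305 itself):

#include <stdint.h>
#include <stdio.h>

#define P 65521u	/* small prime standing in for 2^130 - 5 */

static uint32_t mulmod(uint32_t a, uint32_t b)
{
	return (uint32_t)(((uint64_t)a * b) % P);
}

int main(void)
{
	uint32_t h = 1234, m1 = 42, m2 = 99, r = 31337 % P;
	uint32_t r2 = mulmod(r, r);	/* the derived key r^2 */

	/* serial: two Horner steps of h = (h + m) * r */
	uint32_t serial = mulmod((mulmod((h + m1) % P, r) + m2) % P, r);
	/* parallel: both blocks multiplied in one step */
	uint32_t par = (mulmod((h + m1) % P, r2) + mulmod(m2, r)) % P;

	printf("serial=%u parallel=%u\n", serial, par);	/* equal */
	return 0;
}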

For large messages, throughput increases by ~45-65% compared to single
block SSE2:

testing speed of poly1305 (poly1305-simd)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3790063 
opers/sec,  363846076 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 5913378 
opers/sec,  567684355 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9352574 
opers/sec,  897847104 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1362145 
opers/sec,  392297990 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2007075 
opers/sec,  578037628 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3709811 
opers/sec, 1068425798 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  566272 
opers/sec,  597984182 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1111657 
opers/sec, 1173910108 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  288857 
opers/sec,  600823808 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates):  590746 
opers/sec, 1228751888 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  301825 
opers/sec, 1245936902 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  153075 
opers/sec, 1258896201 bytes/sec

testing speed of poly1305 (poly1305-simd)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3809514 
opers/sec,  365713411 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 5973423 
opers/sec,  573448627 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9446779 
opers/sec,  906890803 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1364814 
opers/sec,  393066691 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2045780 
opers/sec,  589184697 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3711946 
opers/sec, 1069040592 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  573686 
opers/sec,  605812732 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1647802 
opers/sec, 1740079440 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  292970 
opers/sec,  609378224 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates):  943229 
opers/sec, 1961916528 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  494623 
opers/sec, 2041804569 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  254045 
opers/sec, 2089271014 bytes/sec

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/poly1305-sse2-x86_64.S | 306 +
 arch/x86/crypto/poly1305_glue.c|  54 +-
 2 files changed, 355 insertions(+), 5 deletions(-)

diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S 
b/arch/x86/crypto/poly1305-sse2-x86_64.S
index a3d2b5e..338c748 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -15,6 +15,7 @@
 .align 16
 
 ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
+ORMASK:	.octa 0x00000000010000000000000001000000
 
 .text
 
@@ -274,3 +275,308 @@ ENTRY(poly1305_block_sse2)
pop %rbx
ret
 ENDPROC(poly1305_block_sse2)
+
+
+#define u0 0x00(%r8)
+#define u1 0x04(%r8)
+#define u2 0x08(%r8)
+#define u3 0x0c(%r8)
+#define u4 0x10(%r8)
+#define hc0 %xmm0
+#define hc1 %xmm1
+#define hc2 %xmm2
+#define hc3 %xmm5
+#define hc4 %xmm6
+#define ru0 %xmm7
+#define ru1 %xmm8
+#define ru2 %xmm9
+#define ru3 %xmm10
+#define ru4 %xmm11
+#define sv1 %xmm12
+#define sv2 %xmm13
+#define sv3 %xmm14
+#define sv4 %xmm15
+#undef d0
+#define d0 %r13
+
+ENTRY(poly1305_2block_sse2)
+   # %rdi: Accumulator h[5]
+   # %rsi: 16 byte input block m
+   # %rdx: Poly1305 key r[5]
+   # %rcx: Doubleblock count
+   # %r8:  Poly1305 derived key r^2 u[5]
+
+   # This two-block variant further improves performance by using loop
+   # unrolled block processing. This is more straightforward and does
+   # less byte shuffling, but requires a second Poly1305 key r^2:
+   # h = (h + m) * r  =>  h = (h + m1) * r^2 + m2 * r
+
+   push %rbx
+   push %r12
+   push %r13
+
+   # combine r0,u0
+   movd u0,ru0
+   movd r0,t1
+   punpcklqdq  t1,ru0
+
+   # combine r1,u1 and s1=r1*5,v1=u1*5
+   movd u1,ru1
+   movd r1,t1
+   punpcklqdq  t1,ru1

[PATCH v2 08/10] crypto: poly1305 - Add a SSE2 SIMD variant for x86_64

2015-07-16 Thread Martin Willi
Implements an x86_64 assembler driver for the Poly1305 authenticator. This
single block variant holds the 130-bit integer in 5 32-bit words, but uses
SSE to do two multiplications/additions in parallel.

When calling updates with small blocks, the overhead for kernel_fpu_begin/
kernel_fpu_end() negates the performance gain. We therefore use the
poly1305-generic fallback for small updates.
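
For reference, the radix-2^26 split of one 16-byte block (plus the
2^128 pad bit) in plain C; this mirrors the driver's 0x3ffffff AND mask
and 0x1000000 OR mask, but is an illustration rather than kernel code:

#include <stdint.h>
#include <stdio.h>

static uint32_t le32(const uint8_t *p)
{
	return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
	       (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

int main(void)
{
	uint8_t m[16] = {1, 2, 3, 4, 5, 6, 7, 8,
			 9, 10, 11, 12, 13, 14, 15, 16};
	uint32_t h[5];

	h[0] = (le32(m +  0) >> 0) & 0x3ffffff;
	h[1] = (le32(m +  3) >> 2) & 0x3ffffff;
	h[2] = (le32(m +  6) >> 4) & 0x3ffffff;
	h[3] = (le32(m +  9) >> 6) & 0x3ffffff;
	h[4] = (le32(m + 12) >> 8) | (1 << 24);	/* set the 2^128 bit */

	/* products of two 26-bit limbs fit comfortably in 64 bits */
	for (int i = 0; i < 5; i++)
		printf("h%d = 0x%07x\n", i, h[i]);
	return 0;
}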

For large messages, throughput increases by ~5-10% compared to
poly1305-generic:

testing speed of poly1305 (poly1305-generic)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 4080026 
opers/sec,  391682496 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 6221094 
opers/sec,  597225024 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9609750 
opers/sec,  922536057 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1459379 
opers/sec,  420301267 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2115179 
opers/sec,  609171609 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3729874 
opers/sec, 1074203856 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  593000 
opers/sec,  626208000 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1081536 
opers/sec, 1142102332 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  302077 
opers/sec,  628320576 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates):  554384 
opers/sec, 1153120176 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  278715 
opers/sec, 1150536345 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  140202 
opers/sec, 1153022070 bytes/sec

testing speed of poly1305 (poly1305-simd)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3790063 
opers/sec,  363846076 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 5913378 
opers/sec,  567684355 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9352574 
opers/sec,  897847104 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1362145 
opers/sec,  392297990 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2007075 
opers/sec,  578037628 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3709811 
opers/sec, 1068425798 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  566272 
opers/sec,  597984182 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1111657 
opers/sec, 1173910108 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  288857 
opers/sec,  600823808 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates):  590746 
opers/sec, 1228751888 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  301825 
opers/sec, 1245936902 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  153075 
opers/sec, 1258896201 bytes/sec

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/Makefile   |   2 +
 arch/x86/crypto/poly1305-sse2-x86_64.S | 276 +
 arch/x86/crypto/poly1305_glue.c| 123 +++
 crypto/Kconfig |  12 ++
 4 files changed, 413 insertions(+)
 create mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S
 create mode 100644 arch/x86/crypto/poly1305_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ce39b3c..5cf405c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
+obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -85,6 +86,7 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
 ifeq ($(avx2_supported),yes)
 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
 endif
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S 
b/arch/x86/crypto/poly1305-sse2-x86_64.S
new file mode 100644
index 000..a3d2b5e
--- /dev/null
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -0,0 +1,276 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General

Re: crypto: chacha20poly1305 - Convert to new AEAD interface

2015-07-16 Thread Martin Willi
Herbert,

 This patch converts rfc7539 and rfc7539esp to the new AEAD interface.
 The test vectors for rfc7539esp have also been updated to include
 the IV.

Thanks for taking care of it, I haven't found the time yet to do it
myself. I can confirm that it works fine under IPsec load, so you may
add my:

Tested-by: Martin Willi mar...@strongswan.org

Regards
Martin



Re: [PATCH 00/10] crypto: x86_64 - Add SSE/AVX2 ChaCha20/Poly1305 ciphers

2015-07-11 Thread Martin Willi

 If you're going to use sec you need to use at least 10 in order
 for it to be meaningful as shorter values often result in bogus
 numbers.

Ok, I'll use sec=10 in v2. There is no fundamental difference compared
to sec=1 (except for very short blocks):

testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-simd,poly1305-simd)) encryption
test 0 (288 bit key, 16 byte blocks): 9498006 operations in 10 seconds 
(151968096 bytes)
test 1 (288 bit key, 64 byte blocks): 9423516 operations in 10 seconds 
(603105024 bytes)
test 2 (288 bit key, 256 byte blocks): 7597253 operations in 10 seconds 
(1944896768 bytes)
test 3 (288 bit key, 512 byte blocks): 6979753 operations in 10 seconds 
(3573633536 bytes)
test 4 (288 bit key, 1024 byte blocks): 5629328 operations in 10 seconds 
(5764431872 bytes)
test 5 (288 bit key, 2048 byte blocks): 4071284 operations in 10 seconds 
(8337989632 bytes)
test 6 (288 bit key, 4096 byte blocks): 2627325 operations in 10 seconds 
(10761523200 bytes)
test 7 (288 bit key, 8192 byte blocks): 1492531 operations in 10 seconds 
(12226813952 bytes)

testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-simd,poly1305-simd)) encryption
test 0 (288 bit key, 16 byte blocks): 896305 operations in 1 seconds (14340880 
bytes)
test 1 (288 bit key, 64 byte blocks): 929638 operations in 1 seconds (59496832 
bytes)
test 2 (288 bit key, 256 byte blocks): 750673 operations in 1 seconds 
(192172288 bytes)
test 3 (288 bit key, 512 byte blocks): 687636 operations in 1 seconds 
(352069632 bytes)
test 4 (288 bit key, 1024 byte blocks): 555209 operations in 1 seconds 
(568534016 bytes)
test 5 (288 bit key, 2048 byte blocks): 402049 operations in 1 seconds 
(823396352 bytes)
test 6 (288 bit key, 4096 byte blocks): 259861 operations in 1 seconds 
(1064390656 bytes)
test 7 (288 bit key, 8192 byte blocks): 147283 operations in 1 seconds 
(1206542336 bytes)

 What sort of variance do you see with cycles?

Here a very fast and a very slow run (these are extremes, though):

testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-simd,poly1305-simd)) encryption
test 0 (288 bit key, 16 byte blocks): 1 operation in 3765 cycles (16 bytes)
test 1 (288 bit key, 64 byte blocks): 1 operation in 3823 cycles (64 bytes)
test 2 (288 bit key, 256 byte blocks): 1 operation in 4728 cycles (256 bytes)
test 3 (288 bit key, 512 byte blocks): 1 operation in 5135 cycles (512 bytes)
test 4 (288 bit key, 1024 byte blocks): 1 operation in 7026 cycles (1024 bytes)
test 5 (288 bit key, 2048 byte blocks): 1 operation in 8804 cycles (2048 bytes)
test 6 (288 bit key, 4096 byte blocks): 1 operation in 14674 cycles (4096 bytes)
test 7 (288 bit key, 8192 byte blocks): 1 operation in 24616 cycles (8192 bytes)

testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-simd,poly1305-simd)) encryption
test 0 (288 bit key, 16 byte blocks): 1 operation in 15031 cycles (16 bytes)
test 1 (288 bit key, 64 byte blocks): 1 operation in 15670 cycles (64 bytes)
test 2 (288 bit key, 256 byte blocks): 1 operation in 13034 cycles (256 bytes)
test 3 (288 bit key, 512 byte blocks): 1 operation in 14045 cycles (512 bytes)
test 4 (288 bit key, 1024 byte blocks): 1 operation in 20944 cycles (1024 bytes)
test 5 (288 bit key, 2048 byte blocks): 1 operation in 26445 cycles (2048 bytes)
test 6 (288 bit key, 4096 byte blocks): 1 operation in 31912 cycles (4096 bytes)
test 7 (288 bit key, 8192 byte blocks): 1 operation in 61366 cycles (8192 bytes)

 Do you get the same variance for other algorithms, e.g., cbc/aes?

Yes, another extreme:

testing speed of cbc(aes) (cbc(aes-aesni)) decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 593 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 1589 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 5311 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 20666 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 161483 cycles (8192 
bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 593 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 1659 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 5609 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 21568 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 172484 cycles (8192 
bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 612 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 1687 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 5836 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 22400 cycles (1024 
bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 177799 cycles (8192 
bytes)

testing speed of cbc(aes) (cbc(aes-aesni)) decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 1130 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 

Re: [PATCH 00/10] crypto: x86_64 - Add SSE/AVX2 ChaCha20/Poly1305 ciphers

2015-07-08 Thread Martin Willi
Herbert,

 Running the speed test with sec=1 makes no sense because it's
 too short.  Please use sec=0 and count cycles instead.

I get less constant numbers between different runs when using sec=0,
hence I've used sec=1. Below are the numbers of average runs for the
AEAD measuring cycles; I'll use cycles in the individual patch notes in
a v2.

Kind regards
Martin

--

generic:
testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-generic,poly1305-generic)) encryption
test 0 (288 bit key, 16 byte blocks): 1 operation in 9444 cycles (16 bytes)
test 1 (288 bit key, 64 byte blocks): 1 operation in 10692 cycles (64 bytes)
test 2 (288 bit key, 256 byte blocks): 1 operation in 18299 cycles (256 bytes)
test 3 (288 bit key, 512 byte blocks): 1 operation in 26952 cycles (512 bytes)
test 4 (288 bit key, 1024 byte blocks): 1 operation in 48493 cycles (1024 bytes)
test 5 (288 bit key, 2048 byte blocks): 1 operation in 83766 cycles (2048 bytes)
test 6 (288 bit key, 4096 byte blocks): 1 operation in 150899 cycles (4096 
bytes)
test 7 (288 bit key, 8192 byte blocks): 1 operation in 296779 cycles (8192 
bytes)

SSE2/SSSE3:
testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-simd,poly1305-simd)) encryption
test 0 (288 bit key, 16 byte blocks): 1 operation in 9814 cycles (16 bytes)
test 1 (288 bit key, 64 byte blocks): 1 operation in 9998 cycles (64 bytes)
test 2 (288 bit key, 256 byte blocks): 1 operation in 12442 cycles (256 bytes)
test 3 (288 bit key, 512 byte blocks): 1 operation in 20321 cycles (512 bytes)
test 4 (288 bit key, 1024 byte blocks): 1 operation in 21098 cycles (1024 bytes)
test 5 (288 bit key, 2048 byte blocks): 1 operation in 33423 cycles (2048 bytes)
test 6 (288 bit key, 4096 byte blocks): 1 operation in 55183 cycles (4096 bytes)
test 7 (288 bit key, 8192 byte blocks): 1 operation in 102514 cycles (8192 
bytes)

AVX2:
testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-simd,poly1305-simd)) encryption
test 0 (288 bit key, 16 byte blocks): 1 operation in 9883 cycles (16 bytes)
test 1 (288 bit key, 64 byte blocks): 1 operation in 10891 cycles (64 bytes)
test 2 (288 bit key, 256 byte blocks): 1 operation in 12467 cycles (256 bytes)
test 3 (288 bit key, 512 byte blocks): 1 operation in 13538 cycles (512 bytes)
test 4 (288 bit key, 1024 byte blocks): 1 operation in 16783 cycles (1024 bytes)
test 5 (288 bit key, 2048 byte blocks): 1 operation in 23161 cycles (2048 bytes)
test 6 (288 bit key, 4096 byte blocks): 1 operation in 37359 cycles (4096 bytes)
test 7 (288 bit key, 8192 byte blocks): 1 operation in 64670 cycles (8192 bytes)




[PATCH 00/10] crypto: x86_64 - Add SSE/AVX2 ChaCha20/Poly1305 ciphers

2015-07-07 Thread Martin Willi
This patch series adds both ChaCha20 and Poly1305 specific ciphers for
x86_64 using SSE2/SSSE3 and AVX2 instructions. The idea is to have a drop-in
replacement for AESNI/CLMUL-accelerated AES-GCM providing at least somewhat
comparable performance, refer to RFC7539 for details. It is based
on cryptodev.

The first patch adds some speed tests to tcrypt. The second patch exports
some functionality from chacha20-generic to use it as fallback. Patch 3
adds a single block SSSE3 driver for ChaCha20, while patch 4 and 5 extend it
by an optimized four block SSSE3 and an eight block AVX2 variant. Patch 6
adds an additional test vector for ChaCha20 to actually test the AVX2 eight
block variant processing 512-bytes at once.

Patch 7 exports some poly1305-generic functionality to use it as fallback.
Patch 8 introduces a single block SSE2 driver for Poly1305, while patch 9
and 10 add an optimized two block SSE2 and a four block AVX2 variant.

Overall speedup for the ChaCha20/Poly1305 AEAD for typical IPsec payloads
is ~50-150% with SSE2/SSSE3 and ~100-200% with AVX2, or even more for larger
payloads:

poly1305-generic:
testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-generic,poly1305-generic)) encryption
test 0 (288 bit key, 16 byte blocks): 902007 operations in 1 seconds (14432112 
bytes)
test 1 (288 bit key, 64 byte blocks): 945302 operations in 1 seconds (60499328 
bytes)
test 2 (288 bit key, 256 byte blocks): 559910 operations in 1 seconds 
(143336960 bytes)
test 3 (288 bit key, 512 byte blocks): 365334 operations in 1 seconds 
(187051008 bytes)
test 4 (288 bit key, 1024 byte blocks): 213663 operations in 1 seconds 
(218790912 bytes)
test 5 (288 bit key, 2048 byte blocks): 117263 operations in 1 seconds 
(240154624 bytes)
test 6 (288 bit key, 4096 byte blocks): 61915 operations in 1 seconds 
(253603840 bytes)
test 7 (288 bit key, 8192 byte blocks): 31662 operations in 1 seconds 
(259375104 bytes)

SSE2/SSSE3:
testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-simd,poly1305-simd)) encryption
test 0 (288 bit key, 16 byte blocks): 945909 operations in 1 seconds (15134544 
bytes)
test 1 (288 bit key, 64 byte blocks): 945702 operations in 1 seconds (60524928 
bytes)
test 2 (288 bit key, 256 byte blocks): 759759 operations in 1 seconds 
(194498304 bytes)
test 3 (288 bit key, 512 byte blocks): 609356 operations in 1 seconds 
(311990272 bytes)
test 4 (288 bit key, 1024 byte blocks): 445479 operations in 1 seconds 
(456170496 bytes)
test 5 (288 bit key, 2048 byte blocks): 289479 operations in 1 seconds 
(592852992 bytes)
test 6 (288 bit key, 4096 byte blocks): 170082 operations in 1 seconds 
(696655872 bytes)
test 7 (288 bit key, 8192 byte blocks): 91443 operations in 1 seconds 
(749101056 bytes)

AVX2:
testing speed of rfc7539esp(chacha20,poly1305) 
(rfc7539esp(chacha20-simd,poly1305-simd)) encryption
test 0 (288 bit key, 16 byte blocks): 896305 operations in 1 seconds (14340880 
bytes)
test 1 (288 bit key, 64 byte blocks): 929638 operations in 1 seconds (59496832 
bytes)
test 2 (288 bit key, 256 byte blocks): 750673 operations in 1 seconds 
(192172288 bytes)
test 3 (288 bit key, 512 byte blocks): 687636 operations in 1 seconds 
(352069632 bytes)
test 4 (288 bit key, 1024 byte blocks): 555209 operations in 1 seconds 
(568534016 bytes)
test 5 (288 bit key, 2048 byte blocks): 402049 operations in 1 seconds 
(823396352 bytes)
test 6 (288 bit key, 4096 byte blocks): 259861 operations in 1 seconds 
(1064390656 bytes)
test 7 (288 bit key, 8192 byte blocks): 147283 operations in 1 seconds 
(1206542336 bytes)

All benchmark results from a Core i5-4670T.

The ChaCha20/Poly1305 AEAD on Haswell with AVX2 has about half the raw
AESNI/CLMUL-accelerated AES-GCM (rfc4106-gcm-aesni) performance for typical
IPsec MTUs. On Ivy Bridge using SSE2/SSSE3 the numbers compared to AES-GCM
are very similar due to the less efficient CLMUL instructions.

Martin Willi (10):
  crypto: tcrypt - Add ChaCha20/Poly1305 speed tests
  crypto: chacha20 - Export common ChaCha20 helpers
  crypto: chacha20 - Add a SSSE3 SIMD variant for x86_64
  crypto: chacha20 - Add a four block SSSE3 variant for x86_64
  crypto: chacha20 - Add an eight block AVX2 variant for x86_64
  crypto: testmgr - Add a longer ChaCha20 test vector
  crypto: poly1305 - Export common Poly1305 helpers
  crypto: poly1305 - Add a SSE2 SIMD variant for x86_64
  crypto: poly1305 - Add a two block SSE2 variant for x86_64
  crypto: poly1305 - Add a four block AVX2 variant for x86_64

 arch/x86/crypto/Makefile|   6 +
 arch/x86/crypto/chacha20-avx2-x86_64.S  | 443 ++
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 625 
 arch/x86/crypto/chacha20_glue.c | 150 
 arch/x86/crypto/poly1305-avx2-x86_64.S  | 386 
 arch/x86/crypto/poly1305-sse2-x86_64.S  | 582 +
 arch/x86/crypto/poly1305_glue.c | 207 +++
 crypto/Kconfig

[PATCH 03/10] crypto: chacha20 - Add a SSSE3 SIMD variant for x86_64

2015-07-07 Thread Martin Willi
Implements an x86_64 assembler driver for the ChaCha20 stream cipher. This
single block variant works on a single state matrix using SSE instructions.
It requires SSSE3 due to the use of pshufb for efficient 8/16-bit rotate
operations.
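
The rotation trick deserves a closer look: rotates by 16 (and 8) move
whole bytes, so each 32-bit lane can be rotated by a byte permutation,
which is what the ROT16/ROT8 pshufb masks encode. A standalone C check
of the equivalence (assumes a little-endian host, as on x86):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

int main(void)
{
	uint32_t x = 0x11223344, r16;
	uint8_t b[4], p[4];

	memcpy(b, &x, 4);		/* little-endian byte view */
	p[0] = b[2]; p[1] = b[3];	/* rotl32(x, 16) as a pure  */
	p[2] = b[0]; p[3] = b[1];	/* byte permutation         */
	memcpy(&r16, p, 4);

	/* rotates by 7 or 12 cross byte boundaries: shift+OR instead */
	printf("shift+or %08x, shuffle %08x\n", rotl32(x, 16), r16);
	return 0;
}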

For large messages, throughput increases by ~65% compared to
chacha20-generic:

testing speed of chacha20 (chacha20-generic) encryption
test 0 (256 bit key, 16 byte blocks): 4015926 operations in 1 seconds (64254816 
bytes)
test 1 (256 bit key, 64 byte blocks): 4161758 operations in 1 seconds 
(266352512 bytes)
test 2 (256 bit key, 256 byte blocks): 1223686 operations in 1 seconds 
(313263616 bytes)
test 3 (256 bit key, 1024 byte blocks): 325200 operations in 1 seconds 
(333004800 bytes)
test 4 (256 bit key, 8192 byte blocks): 40725 operations in 1 seconds 
(333619200 bytes)

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 4154698 operations in 1 seconds (66475168 
bytes)
test 1 (256 bit key, 64 byte blocks): 4593368 operations in 1 seconds 
(293975552 bytes)
test 2 (256 bit key, 256 byte blocks): 1796194 operations in 1 seconds 
(459825664 bytes)
test 3 (256 bit key, 1024 byte blocks): 519725 operations in 1 seconds 
(532198400 bytes)
test 4 (256 bit key, 8192 byte blocks): 67132 operations in 1 seconds 
(549945344 bytes)

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/Makefile|   2 +
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 142 
 arch/x86/crypto/chacha20_glue.c | 123 +++
 crypto/Kconfig  |  15 
 4 files changed, 282 insertions(+)
 create mode 100644 arch/x86/crypto/chacha20-ssse3-x86_64.S
 create mode 100644 arch/x86/crypto/chacha20_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5a4a089..b09e9a4 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -60,6 +61,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
+chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
 ifeq ($(avx_supported),yes)
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S 
b/arch/x86/crypto/chacha20-ssse3-x86_64.S
new file mode 100644
index 000..1b97ad0
--- /dev/null
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -0,0 +1,142 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 16
+
+ROT8:  .octa 0x0e0d0c0f0a09080b0605040702010003
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+
+.text
+
+ENTRY(chacha20_block_xor_ssse3)
+   # %rdi: Input state matrix, s
+   # %rsi: 1 data block output, o
+   # %rdx: 1 data block input, i
+
+   # This function encrypts one ChaCha20 block by loading the state matrix
+   # in four SSE registers. It performs matrix operations on four words in
+   # parallel, but requires shuffling to rearrange the words after each
+   # round. 8/16-bit word rotation is done with the slightly better
+   # performing SSSE3 byte shuffling, 7/12-bit word rotation uses
+   # traditional shift+OR.
+
+   # x0..3 = s0..3
+   movdqa  0x00(%rdi),%xmm0
+   movdqa  0x10(%rdi),%xmm1
+   movdqa  0x20(%rdi),%xmm2
+   movdqa  0x30(%rdi),%xmm3
+   movdqa  %xmm0,%xmm8
+   movdqa  %xmm1,%xmm9
+   movdqa  %xmm2,%xmm10
+   movdqa  %xmm3,%xmm11
+
+   movdqa  ROT8(%rip),%xmm4
+   movdqa  ROT16(%rip),%xmm5
+
+   mov $10,%ecx
+
+.Ldoubleround:
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   paddd   %xmm1,%xmm0
+   pxor %xmm0,%xmm3
+   pshufb  %xmm5,%xmm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   paddd   %xmm3,%xmm2
+   pxor %xmm2,%xmm1
+   movdqa

[PATCH 09/10] crypto: poly1305 - Add a two block SSE2 variant for x86_64

2015-07-07 Thread Martin Willi
Extends the x86_64 SSE2 Poly1305 authenticator by a function processing two
consecutive Poly1305 blocks in parallel using a derived key r^2. Loop
unrolling can be more effectively mapped to SSE instructions, further
increasing throughput.

For large messages, throughput increases by ~45-65% compared to single
block SSE2:

testing speed of poly1305 (poly1305-simd)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3654724 
opers/sec,  350853504 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 5939245 
opers/sec,  570167520 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9480730 
opers/sec,  910150080 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1387720 
opers/sec,  399663360 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2031633 
opers/sec,  585110304 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3697643 
opers/sec, 1064921184 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  570658 
opers/sec,  602614848 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1129287 
opers/sec, 1192527072 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  289452 
opers/sec,  602060160 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates):  595957 
opers/sec, 1239590560 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  303426 
opers/sec, 1252542528 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  153136 
opers/sec, 1259390464 bytes/sec

testing speed of poly1305 (poly1305-simd)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3623907 
opers/sec,  347895072 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 5915244 
opers/sec,  567863424 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9489525 
opers/sec,  910994400 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1369657 
opers/sec,  394461216 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2016000 
opers/sec,  580608000 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3716146 
opers/sec, 1070250048 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  561913 
opers/sec,  593380128 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1635554 
opers/sec, 1727145024 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  286540 
opers/sec,  596003200 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates):  935339 
opers/sec, 1945505120 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  491810 
opers/sec, 2030191680 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  252857 
opers/sec, 2079495968 bytes/sec

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/poly1305-sse2-x86_64.S | 306 +
 arch/x86/crypto/poly1305_glue.c|  54 +-
 2 files changed, 355 insertions(+), 5 deletions(-)

diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S 
b/arch/x86/crypto/poly1305-sse2-x86_64.S
index a3d2b5e..338c748 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -15,6 +15,7 @@
 .align 16
 
 ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
+ORMASK:	.octa 0x00000000010000000000000001000000
 
 .text
 
@@ -274,3 +275,308 @@ ENTRY(poly1305_block_sse2)
pop %rbx
ret
 ENDPROC(poly1305_block_sse2)
+
+
+#define u0 0x00(%r8)
+#define u1 0x04(%r8)
+#define u2 0x08(%r8)
+#define u3 0x0c(%r8)
+#define u4 0x10(%r8)
+#define hc0 %xmm0
+#define hc1 %xmm1
+#define hc2 %xmm2
+#define hc3 %xmm5
+#define hc4 %xmm6
+#define ru0 %xmm7
+#define ru1 %xmm8
+#define ru2 %xmm9
+#define ru3 %xmm10
+#define ru4 %xmm11
+#define sv1 %xmm12
+#define sv2 %xmm13
+#define sv3 %xmm14
+#define sv4 %xmm15
+#undef d0
+#define d0 %r13
+
+ENTRY(poly1305_2block_sse2)
+   # %rdi: Accumulator h[5]
+   # %rsi: 16 byte input block m
+   # %rdx: Poly1305 key r[5]
+   # %rcx: Doubleblock count
+   # %r8:  Poly1305 derived key r^2 u[5]
+
+   # This two-block variant further improves performance by using loop
+   # unrolled block processing. This is more straightforward and does
+   # less byte shuffling, but requires a second Poly1305 key r^2:
+   # h = (h + m) * r  =>  h = (h + m1) * r^2 + m2 * r
+
+   push %rbx
+   push %r12
+   push %r13
+
+   # combine r0,u0
+   movd u0,ru0
+   movd r0,t1
+   punpcklqdq  t1,ru0
+
+   # combine r1,u1 and s1=r1*5,v1=u1*5
+   movd u1,ru1
+   movd r1,t1
+   punpcklqdq  t1,ru1

[PATCH 10/10] crypto: poly1305 - Add a four block AVX2 variant for x86_64

2015-07-07 Thread Martin Willi
Extends the x86_64 Poly1305 authenticator by a function processing four
consecutive Poly1305 blocks in parallel using AVX2 instructions.

For large messages, throughput increases by ~15-45% compared to two
block SSE2:

testing speed of poly1305 (poly1305-simd)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3623907 
opers/sec,  347895072 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 5915244 
opers/sec,  567863424 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9489525 
opers/sec,  910994400 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1369657 
opers/sec,  394461216 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2016000 
opers/sec,  580608000 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3716146 
opers/sec, 1070250048 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  561913 
opers/sec,  593380128 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1635554 
opers/sec, 1727145024 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  286540 
opers/sec,  596003200 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates):  935339 
opers/sec, 1945505120 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  491810 
opers/sec, 2030191680 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  252857 
opers/sec, 2079495968 bytes/sec

testing speed of poly1305 (poly1305-simd)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3572538 
opers/sec,  342963648 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 5868291 
opers/sec,  563355936 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9448599 
opers/sec,  907065504 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1352271 
opers/sec,  389454048 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 1993268 
opers/sec,  574061184 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3693984 
opers/sec, 1063867392 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  555095 
opers/sec,  586180320 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1909861 
opers/sec, 2016813216 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  282260 
opers/sec,  587100800 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates): 1221476 
opers/sec, 2540670080 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  677578 
opers/sec, 2797041984 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  364094 
opers/sec, 2994309056 bytes/sec

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/Makefile   |   1 +
 arch/x86/crypto/poly1305-avx2-x86_64.S | 386 +
 arch/x86/crypto/poly1305_glue.c|  40 
 crypto/Kconfig |   2 +-
 4 files changed, 428 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5cf405c..9a2838c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -89,6 +89,7 @@ sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
 ifeq ($(avx2_supported),yes)
 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
+poly1305-x86_64-y += poly1305-avx2-x86_64.o
 endif
 crc32c-intel-y := crc32c-intel_glue.o
 crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
new file mode 100644
index 000..eff2f41
--- /dev/null
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -0,0 +1,386 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 32
+
+ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
+	.octa 0x0000000003ffffff0000000003ffffff
+ORMASK:	.octa 0x00000000010000000000000001000000
+	.octa 0x00000000010000000000000001000000
+
+.text
+
+#define h0 0x00(%rdi)
+#define h1 0x04(%rdi)
+#define h2 0x08(%rdi)
+#define h3 0x0c(%rdi)
+#define h4 0x10(%rdi)
+#define r0 0x00(%rdx)
+#define r1 0x04(%rdx)
+#define r2 0x08(%rdx)
+#define r3 0x0c(%rdx)
+#define r4 0x10(%rdx)
+#define u0 0x00(%r8)
+#define u1 0x04(%r8)
+#define u2 0x08(%r8)
+#define u3 0x0c(%r8)
+#define u4 0x10(%r8)

[PATCH 07/10] crypto: poly1305 - Export common Poly1305 helpers

2015-07-07 Thread Martin Willi
As architecture specific drivers need a software fallback, export Poly1305
init/update/final functions together with some helpers in a header file.
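
A SIMD driver can then use these exports as its software fallback when the
FPU is unavailable. A rough sketch of that pattern (hypothetical function
name; include paths as on contemporary x86 kernels and may differ by
version):

	#include <crypto/internal/hash.h>
	#include <crypto/poly1305.h>
	#include <asm/fpu/api.h>

	static int poly1305_simd_update(struct shash_desc *desc,
					const u8 *src, unsigned int srclen)
	{
		/* fall back to the exported generic helper when the
		 * FPU cannot be used in this context */
		if (!irq_fpu_usable())
			return crypto_poly1305_update(desc, src, srclen);

		kernel_fpu_begin();
		/* ... process blocks with SSE2/AVX2 here ... */
		kernel_fpu_end();
		return 0;
	}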

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/chacha20poly1305.c |  4 +--
 crypto/poly1305_generic.c | 73 +++
 include/crypto/poly1305.h | 41 ++
 3 files changed, 77 insertions(+), 41 deletions(-)
 create mode 100644 include/crypto/poly1305.h

diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index c9a36a9..1f2f8b4 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -14,6 +14,7 @@
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <crypto/chacha20.h>
+#include <crypto/poly1305.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -21,9 +22,6 @@
 
 #include "internal.h"
 
-#define POLY1305_BLOCK_SIZE	16
-#define POLY1305_DIGEST_SIZE	16
-#define POLY1305_KEY_SIZE	32
 #define CHACHAPOLY_IV_SIZE 12
 
 struct chachapoly_instance_ctx {
diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
index 387b5c8..2df9835d 100644
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -13,31 +13,11 @@
 
 #include <crypto/algapi.h>
 #include <crypto/internal/hash.h>
+#include <crypto/poly1305.h>
 #include <linux/crypto.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 
-#define POLY1305_BLOCK_SIZE	16
-#define POLY1305_KEY_SIZE	32
-#define POLY1305_DIGEST_SIZE	16
-
-struct poly1305_desc_ctx {
-   /* key */
-   u32 r[5];
-   /* finalize key */
-   u32 s[4];
-   /* accumulator */
-   u32 h[5];
-   /* partial buffer */
-   u8 buf[POLY1305_BLOCK_SIZE];
-   /* bytes used in partial buffer */
-   unsigned int buflen;
-   /* r key has been set */
-   bool rset;
-   /* s key has been set */
-   bool sset;
-};
-
 static inline u64 mlt(u64 a, u64 b)
 {
return a * b;
@@ -58,7 +38,7 @@ static inline u32 le32_to_cpuvp(const void *p)
return le32_to_cpup(p);
 }
 
-static int poly1305_init(struct shash_desc *desc)
+int crypto_poly1305_init(struct shash_desc *desc)
 {
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
@@ -69,8 +49,9 @@ static int poly1305_init(struct shash_desc *desc)
 
return 0;
 }
+EXPORT_SYMBOL_GPL(crypto_poly1305_init);
 
-static int poly1305_setkey(struct crypto_shash *tfm,
+int crypto_poly1305_setkey(struct crypto_shash *tfm,
   const u8 *key, unsigned int keylen)
 {
/* Poly1305 requires a unique key for each tag, which implies that
@@ -79,6 +60,7 @@ static int poly1305_setkey(struct crypto_shash *tfm,
 * the update() call. */
return -ENOTSUPP;
 }
+EXPORT_SYMBOL_GPL(crypto_poly1305_setkey);
 
 static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
 {
@@ -98,16 +80,10 @@ static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key)
 	dctx->s[3] = le32_to_cpuvp(key + 12);
 }
 
-static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
-   const u8 *src, unsigned int srclen,
-   u32 hibit)
+unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+   const u8 *src, unsigned int srclen)
 {
-   u32 r0, r1, r2, r3, r4;
-   u32 s1, s2, s3, s4;
-   u32 h0, h1, h2, h3, h4;
-   u64 d0, d1, d2, d3, d4;
-
-	if (unlikely(!dctx->sset)) {
+	if (!dctx->sset) {
 		if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
 			poly1305_setrkey(dctx, src);
 			src += POLY1305_BLOCK_SIZE;
@@ -121,6 +97,25 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
 			dctx->sset = true;
 		}
 	}
+   return srclen;
+}
+EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
+
+static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
+   const u8 *src, unsigned int srclen,
+   u32 hibit)
+{
+   u32 r0, r1, r2, r3, r4;
+   u32 s1, s2, s3, s4;
+   u32 h0, h1, h2, h3, h4;
+   u64 d0, d1, d2, d3, d4;
+   unsigned int datalen;
+
+	if (unlikely(!dctx->sset)) {
+   datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+   src += srclen - datalen;
+   srclen = datalen;
+   }
 
 	r0 = dctx->r[0];
 	r1 = dctx->r[1];
@@ -181,7 +176,7 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
return srclen;
 }
 
-static int poly1305_update(struct shash_desc *desc,
+int crypto_poly1305_update(struct shash_desc *desc,
   const u8 *src, unsigned int srclen)
 {
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
@@ -214,8 +209,9 @@ static int poly1305_update(struct shash_desc *desc

[PATCH 05/10] crypto: chacha20 - Add an eight block AVX2 variant for x86_64

2015-07-07 Thread Martin Willi
Extends the x86_64 ChaCha20 implementation by a function processing eight
ChaCha20 blocks in parallel using AVX2.

For large messages, throughput increases by ~55-70% compared to four block
SSSE3:

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 4164293 operations in 1 seconds (66628688 
bytes)
test 1 (256 bit key, 64 byte blocks): 4545912 operations in 1 seconds 
(290938368 bytes)
test 2 (256 bit key, 256 byte blocks): 3238241 operations in 1 seconds 
(828989696 bytes)
test 3 (256 bit key, 1024 byte blocks): 1120664 operations in 1 seconds 
(1147559936 bytes)
test 4 (256 bit key, 8192 byte blocks): 140107 operations in 1 seconds 
(1147756544 bytes)

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 4166978 operations in 1 seconds (66671648 
bytes)
test 1 (256 bit key, 64 byte blocks): 4557525 operations in 1 seconds 
(291681600 bytes)
test 2 (256 bit key, 256 byte blocks): 3231026 operations in 1 seconds 
(827142656 bytes)
test 3 (256 bit key, 1024 byte blocks): 1929946 operations in 1 seconds 
(1976264704 bytes)
test 4 (256 bit key, 8192 byte blocks): 218037 operations in 1 seconds 
(1786159104 bytes)

Benchmark results from a Core i5-4670T.
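
The reason 8/16-bit rotations can use a byte shuffle is that rotating a
32-bit word by a multiple of 8 only permutes its bytes. A small standalone
C illustration (little-endian host assumed, not part of the patch):

	#include <stdint.h>
	#include <string.h>
	#include <stdio.h>

	static uint32_t rotl32(uint32_t v, int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	int main(void)
	{
		uint32_t x = 0x11223344, y;
		uint8_t b[4], r8[4];

		memcpy(b, &x, sizeof(b));	/* b = {44,33,22,11} on LE */

		/* rotl by 8: each byte moves up one lane, top byte wraps */
		r8[0] = b[3]; r8[1] = b[0]; r8[2] = b[1]; r8[3] = b[2];
		memcpy(&y, r8, sizeof(y));

		printf("%08x %08x\n", (unsigned)rotl32(x, 8), (unsigned)y);
		return rotl32(x, 8) != y;	/* both 0x22334411 */
	}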

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/Makefile   |   1 +
 arch/x86/crypto/chacha20-avx2-x86_64.S | 443 +
 arch/x86/crypto/chacha20_glue.c|  19 ++
 crypto/Kconfig |   2 +-
 4 files changed, 464 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/crypto/chacha20-avx2-x86_64.S

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index b09e9a4..ce39b3c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -77,6 +77,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o 
camellia_aesni_avx2_glue.o
+   chacha20-x86_64-y += chacha20-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 endif
 
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
new file mode 100644
index 000..16694e6
--- /dev/null
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -0,0 +1,443 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 32
+
+ROT8:  .octa 0x0e0d0c0f0a09080b0605040702010003
+   .octa 0x0e0d0c0f0a09080b0605040702010003
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+   .octa 0x0d0c0f0e09080b0a0504070601000302
+CTRINC:.octa 0x000300020001
+   .octa 0x0007000600050004
+
+.text
+
+ENTRY(chacha20_8block_xor_avx2)
+   # %rdi: Input state matrix, s
+   # %rsi: 8 data blocks output, o
+   # %rdx: 8 data blocks input, i
+
+   # This function encrypts eight consecutive ChaCha20 blocks by loading
+   # the state matrix in AVX registers eight times. As we need some
+   # scratch registers, we save the first four registers on the stack. The
+   # algorithm performs each operation on the corresponding word of each
+   # state matrix, hence requires no word shuffling. For final XORing step
+   # we transpose the matrix by interleaving 32-, 64- and then 128-bit
+   # words, which allows us to do XOR in AVX registers. 8/16-bit word
+   # rotation is done with the slightly better performing byte shuffling,
+   # 7/12-bit word rotation uses traditional shift+OR.
+
+   vzeroupper
+   # 4 * 32 byte stack, 32-byte aligned
+   mov %rsp, %r8
+   and $~31, %rsp
+   sub $0x80, %rsp
+
+   # x0..15[0-7] = s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpbroadcastd	0x04(%rdi),%ymm1
+	vpbroadcastd	0x08(%rdi),%ymm2
+	vpbroadcastd	0x0c(%rdi),%ymm3
+	vpbroadcastd	0x10(%rdi),%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm15
+   # x0..3 on stack
+   vmovdqa %ymm0,0x00(%rsp)
+   vmovdqa %ymm1,0x20(%rsp)
+   vmovdqa %ymm2,0x40(%rsp)
+   vmovdqa %ymm3,0x60(%rsp)
+
+   vmovdqa CTRINC(%rip),%ymm1
+   vmovdqa ROT8(%rip),%ymm2

[PATCH 06/10] crypto: testmgr - Add a longer ChaCha20 test vector

2015-07-07 Thread Martin Willi
The AVX2 variant of ChaCha20 is used only for messages of >= 512 bytes
length, so the implementation could not be tested with the existing test
vectors. Due to the lack of such a long official test vector, this one is
self-generated using chacha20-generic.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/testmgr.h | 334 ++-
 1 file changed, 333 insertions(+), 1 deletion(-)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index b052555..b77901d 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -30180,7 +30180,7 @@ static struct cipher_testvec salsa20_stream_enc_tv_template[] = {
},
 };
 
-#define CHACHA20_ENC_TEST_VECTORS 3
+#define CHACHA20_ENC_TEST_VECTORS 4
 static struct cipher_testvec chacha20_enc_tv_template[] = {
{ /* RFC7539 A.2. Test Vector #1 */
.key= \x00\x00\x00\x00\x00\x00\x00\x00
@@ -30354,6 +30354,338 @@ static struct cipher_testvec chacha20_enc_tv_template[] = {
  \x87\xb5\x8d\xfd\x72\x8a\xfa\x36
  \x75\x7a\x79\x7a\xc1\x88\xd1,
.rlen   = 127,
+   }, { /* Self-made test vector for long data */
+   .key= \x1c\x92\x40\xa5\xeb\x55\xd3\x8a
+ \xf3\x33\x88\x86\x04\xf6\xb5\xf0
+ \x47\x39\x17\xc1\x40\x2b\x80\x09
+ \x9d\xca\x5c\xbc\x20\x70\x75\xc0,
+   .klen   = 32,
+   .iv = \x1c\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x01,
+   .input  = \x49\xee\xe0\xdc\x24\x90\x40\xcd
+ \xc5\x40\x8f\x47\x05\xbc\xdd\x81
+ \x47\xc6\x8d\xe6\xb1\x8f\xd7\xcb
+ \x09\x0e\x6e\x22\x48\x1f\xbf\xb8
+ \x5c\xf7\x1e\x8a\xc1\x23\xf2\xd4
+ \x19\x4b\x01\x0f\x4e\xa4\x43\xce
+ \x01\xc6\x67\xda\x03\x91\x18\x90
+ \xa5\xa4\x8e\x45\x03\xb3\x2d\xac
+ \x74\x92\xd3\x53\x47\xc8\xdd\x25
+ \x53\x6c\x02\x03\x87\x0d\x11\x0c
+ \x58\xe3\x12\x18\xfd\x2a\x5b\x40
+ \x0c\x30\xf0\xb8\x3f\x43\xce\xae
+ \x65\x3a\x7d\x7c\xf4\x54\xaa\xcc
+ \x33\x97\xc3\x77\xba\xc5\x70\xde
+ \xd7\xd5\x13\xa5\x65\xc4\x5f\x0f
+ \x46\x1a\x0d\x97\xb5\xf3\xbb\x3c
+ \x84\x0f\x2b\xc5\xaa\xea\xf2\x6c
+ \xc9\xb5\x0c\xee\x15\xf3\x7d\xbe
+ \x9f\x7b\x5a\xa6\xae\x4f\x83\xb6
+ \x79\x49\x41\xf4\x58\x18\xcb\x86
+ \x7f\x30\x0e\xf8\x7d\x44\x36\xea
+ \x75\xeb\x88\x84\x40\x3c\xad\x4f
+ \x6f\x31\x6b\xaa\x5d\xe5\xa5\xc5
+ \x21\x66\xe9\xa7\xe3\xb2\x15\x88
+ \x78\xf6\x79\xa1\x59\x47\x12\x4e
+ \x9f\x9f\x64\x1a\xa0\x22\x5b\x08
+ \xbe\x7c\x36\xc2\x2b\x66\x33\x1b
+ \xdd\x60\x71\xf7\x47\x8c\x61\xc3
+ \xda\x8a\x78\x1e\x16\xfa\x1e\x86
+ \x81\xa6\x17\x2a\xa7\xb5\xc2\xe7
+ \xa4\xc7\x42\xf1\xcf\x6a\xca\xb4
+ \x45\xcf\xf3\x93\xf0\xe7\xea\xf6
+ \xf4\xe6\x33\x43\x84\x93\xa5\x67
+ \x9b\x16\x58\x58\x80\x0f\x2b\x5c
+ \x24\x74\x75\x7f\x95\x81\xb7\x30
+ \x7a\x33\xa7\xf7\x94\x87\x32\x27
+ \x10\x5d\x14\x4c\x43\x29\xdd\x26
+ \xbd\x3e\x3c\x0e\xfe\x0e\xa5\x10
+ \xea\x6b\x64\xfd\x73\xc6\xed\xec
+ \xa8\xc9\xbf\xb3\xba\x0b\x4d\x07
+ \x70\xfc\x16\xfd\x79\x1e\xd7\xc5
+ \x49\x4e\x1c\x8b\x8d\x79\x1b\xb1
+ \xec\xca\x60\x09\x4c\x6a\xd5\x09
+ \x49\x46\x00\x88\x22\x8d\xce\xea
+ \xb1\x17\x11\xde\x42\xd2\x23\xc1
+ \x72\x11\xf5\x50\x73\x04\x40\x47
+ \xf9\x5d\xe7\xa7\x26\xb1\x7e\xb0
+ \x3f\x58\xc1\x52\xab\x12\x67\x9d
+ \x3f\x43\x4b\x68\xd4\x9c\x68\x38
+ \x07\x8a\x2d\x3e\xf3\xaf\x6a\x4b
+ \xf9\xe5\x31\x69\x22\xf9\xa6\x69
+ \xc6\x9c\x96\x9a\x12\x35\x95\x1d
+ \x95\xd5\xdd\xbe\xbf\x93\x53\x24
+ \xfd\xeb\xc2\x0a\x64\xb0\x77\x00
+ \x6f\x88\xc4\x37\x18\x69\x7c\xd7
+ \x41\x92\x55\x4c\x03\xa1\x9a\x4b
+ \x15\xe5\xdf\x7f\x37\x33\x72\xc1
+ \x8b\x10\x67\xa3\x01\x57

[PATCH 08/10] crypto: poly1305 - Add a SSE2 SIMD variant for x86_64

2015-07-07 Thread Martin Willi
Implements an x86_64 assembler driver for the Poly1305 authenticator. This
single block variant holds the 130-bit integer in 5 32-bit words, but uses
SSE to do two multiplications/additions in parallel.

When calling updates with small blocks, the overhead for kernel_fpu_begin/
kernel_fpu_end() negates the performance gain. We therefore use the
poly1305-generic fallback for small updates.
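
For reference, the 5x26-bit limb layout splits the 130-bit value so that
limb products still fit in 64 bits before carry propagation (5 * 26 = 130).
A standalone sketch of loading one 16-byte block into such limbs
(little-endian host assumed, not the kernel code itself):

	#include <stdint.h>
	#include <string.h>
	#include <stdio.h>

	int main(void)
	{
		/* 16 message bytes plus the implicit 2^128 bit */
		uint8_t m[16] = { 0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,
				  0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10 };
		uint32_t w[4], h[5];
		int i;

		memcpy(w, m, 16);	/* little-endian word loads */

		h[0] =   w[0] & 0x3ffffff;
		h[1] = (uint32_t)((((uint64_t)w[1] << 32) | w[0]) >> 26) & 0x3ffffff;
		h[2] = (uint32_t)((((uint64_t)w[2] << 32) | w[1]) >> 20) & 0x3ffffff;
		h[3] = (uint32_t)((((uint64_t)w[3] << 32) | w[2]) >> 14) & 0x3ffffff;
		h[4] =  (w[3] >> 8) | (1 << 24);  /* hibit for a full block */

		for (i = 0; i < 5; i++)
			printf("h[%d] = 0x%07x\n", i, (unsigned)h[i]);
		return 0;
	}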

For large messages, throughput increases by ~5-10% compared to
poly1305-generic:

testing speed of poly1305 (poly1305-generic)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3599215 
opers/sec,  345524640 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 6116275 
opers/sec,  587162400 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9608427 
opers/sec,  922408992 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1453208 
opers/sec,  418523904 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2103440 
opers/sec,  605790720 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3691593 
opers/sec, 1063178784 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  593364 
opers/sec,  626592384 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1074975 
opers/sec, 1135173600 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  303882 
opers/sec,  632074560 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates):  548559 
opers/sec, 1141002720 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  276084 
opers/sec, 1139674752 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  138984 
opers/sec, 1143004416 bytes/sec

testing speed of poly1305 (poly1305-simd)
test  0 (   96 byte blocks,   16 bytes per update,   6 updates): 3654724 
opers/sec,  350853504 bytes/sec
test  1 (   96 byte blocks,   32 bytes per update,   3 updates): 5939245 
opers/sec,  570167520 bytes/sec
test  2 (   96 byte blocks,   96 bytes per update,   1 updates): 9480730 
opers/sec,  910150080 bytes/sec
test  3 (  288 byte blocks,   16 bytes per update,  18 updates): 1387720 
opers/sec,  399663360 bytes/sec
test  4 (  288 byte blocks,   32 bytes per update,   9 updates): 2031633 
opers/sec,  585110304 bytes/sec
test  5 (  288 byte blocks,  288 bytes per update,   1 updates): 3697643 
opers/sec, 1064921184 bytes/sec
test  6 ( 1056 byte blocks,   32 bytes per update,  33 updates):  570658 
opers/sec,  602614848 bytes/sec
test  7 ( 1056 byte blocks, 1056 bytes per update,   1 updates): 1129287 
opers/sec, 1192527072 bytes/sec
test  8 ( 2080 byte blocks,   32 bytes per update,  65 updates):  289452 
opers/sec,  602060160 bytes/sec
test  9 ( 2080 byte blocks, 2080 bytes per update,   1 updates):  595957 
opers/sec, 1239590560 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update,   1 updates):  303426 
opers/sec, 1252542528 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update,   1 updates):  153136 
opers/sec, 1259390464 bytes/sec

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 arch/x86/crypto/Makefile   |   2 +
 arch/x86/crypto/poly1305-sse2-x86_64.S | 276 +
 arch/x86/crypto/poly1305_glue.c| 123 +++
 crypto/Kconfig |  12 ++
 4 files changed, 413 insertions(+)
 create mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S
 create mode 100644 arch/x86/crypto/poly1305_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ce39b3c..5cf405c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
+obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -85,6 +86,7 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
 ifeq ($(avx2_supported),yes)
 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
 endif
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
new file mode 100644
index 000..a3d2b5e
--- /dev/null
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -0,0 +1,276 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General

[PATCH 01/10] crypto: tcrypt - Add ChaCha20/Poly1305 speed tests

2015-07-07 Thread Martin Willi
Adds individual ChaCha20 and Poly1305 speed tests and a combined rfc7539esp
AEAD speed test, using mode numbers 214, 321 and 213. For Poly1305 we add a specific
speed template, as it expects the key prepended to the input data.
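
For example, these tests can then be invoked with something like
"modprobe tcrypt mode=214 sec=1" for the ChaCha20 cipher test, mode=321 for
Poly1305 and mode=213 for the combined AEAD (invocation shown for
illustration, following the usual tcrypt conventions).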

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/tcrypt.c | 15 +++
 crypto/tcrypt.h | 20 
 2 files changed, 35 insertions(+)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 9f6f10b..0e2f651 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1767,6 +1767,17 @@ static int do_test(const char *alg, u32 type, u32 mask, int m)
NULL, 0, 16, 8, aead_speed_template_19);
break;
 
+	case 213:
+		test_aead_speed("rfc7539esp(chacha20,poly1305)", ENCRYPT, sec,
+				NULL, 0, 16, 8, aead_speed_template_36);
+		break;
+
+	case 214:
+		test_cipher_speed("chacha20", ENCRYPT, sec, NULL, 0,
+				  speed_template_32);
+		break;
+
+
case 300:
if (alg) {
test_hash_speed(alg, sec, generic_hash_speed_template);
@@ -1855,6 +1866,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m)
 		test_hash_speed("crct10dif", sec, generic_hash_speed_template);
 		if (mode > 300 && mode < 400) break;
 
+	case 321:
+		test_hash_speed("poly1305", sec, poly1305_speed_template);
+		if (mode > 300 && mode < 400) break;
+
case 399:
break;
 
diff --git a/crypto/tcrypt.h b/crypto/tcrypt.h
index 6cc1b85..f0bfee1 100644
--- a/crypto/tcrypt.h
+++ b/crypto/tcrypt.h
@@ -61,12 +61,14 @@ static u8 speed_template_32_40_48[] = {32, 40, 48, 0};
 static u8 speed_template_32_48[] = {32, 48, 0};
 static u8 speed_template_32_48_64[] = {32, 48, 64, 0};
 static u8 speed_template_32_64[] = {32, 64, 0};
+static u8 speed_template_32[] = {32, 0};
 
 /*
  * AEAD speed tests
  */
 static u8 aead_speed_template_19[] = {19, 0};
 static u8 aead_speed_template_20[] = {20, 0};
+static u8 aead_speed_template_36[] = {36, 0};
 
 /*
  * Digest speed tests
@@ -127,4 +129,22 @@ static struct hash_speed hash_speed_template_16[] = {
{  .blen = 0,   .plen = 0,  .klen = 0, }
 };
 
+static struct hash_speed poly1305_speed_template[] = {
+   { .blen = 96,   .plen = 16, },
+   { .blen = 96,   .plen = 32, },
+   { .blen = 96,   .plen = 96, },
+   { .blen = 288,  .plen = 16, },
+   { .blen = 288,  .plen = 32, },
+   { .blen = 288,  .plen = 288, },
+   { .blen = 1056, .plen = 32, },
+   { .blen = 1056, .plen = 1056, },
+   { .blen = 2080, .plen = 32, },
+   { .blen = 2080, .plen = 2080, },
+   { .blen = 4128, .plen = 4128, },
+   { .blen = 8224, .plen = 8224, },
+
+   /* End marker */
+   {  .blen = 0,   .plen = 0, }
+};
+
 #endif /* _CRYPTO_TCRYPT_H */
-- 
1.9.1



[PATCH] crypto: poly1305 - Pass key as first two message blocks to each desc_ctx

2015-06-16 Thread Martin Willi
The Poly1305 authenticator requires a unique key for each generated tag. This
implies that we can't set the key per tfm, as multiple users set individual
keys. Instead we pass a desc-specific key as the first two blocks of the
message to authenticate in update().
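
The resulting calling convention, roughly sketched (hypothetical helper
using the standard shash API):

	#include <crypto/hash.h>

	/* one-shot Poly1305: the 32 key bytes are simply the first data
	 * fed to update(), before the message itself */
	static int poly1305_tag(struct shash_desc *desc, const u8 key[32],
				const u8 *msg, unsigned int len, u8 tag[16])
	{
		int err;

		err = crypto_shash_init(desc);
		if (err)
			return err;
		err = crypto_shash_update(desc, key, 32);	/* r, then s */
		if (err)
			return err;
		err = crypto_shash_update(desc, msg, len);
		if (err)
			return err;
		return crypto_shash_final(desc, tag);
	}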

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/chacha20poly1305.c | 54 +++---
 crypto/poly1305_generic.c | 97 --
 crypto/testmgr.h  | 99 +--
 3 files changed, 134 insertions(+), 116 deletions(-)

diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index 05fbc59..7b46ed7 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -54,14 +54,14 @@ struct poly_req {
 };
 
 struct chacha_req {
-   /* the key we generate for Poly1305 using Chacha20 */
-   u8 key[POLY1305_KEY_SIZE];
u8 iv[CHACHA20_IV_SIZE];
struct scatterlist src[1];
struct ablkcipher_request req; /* must be last member */
 };
 
 struct chachapoly_req_ctx {
+   /* the key we generate for Poly1305 using Chacha20 */
+   u8 key[POLY1305_KEY_SIZE];
/* calculated Poly1305 tag */
u8 tag[POLY1305_DIGEST_SIZE];
/* length of data to en/decrypt, without ICV */
@@ -294,53 +294,59 @@ static int poly_ad(struct aead_request *req)
return poly_adpad(req);
 }
 
-static void poly_init_done(struct crypto_async_request *areq, int err)
+static void poly_setkey_done(struct crypto_async_request *areq, int err)
 {
 	async_done_continue(areq->data, err, poly_ad);
 }
 
-static int poly_init(struct aead_request *req)
+static int poly_setkey(struct aead_request *req)
 {
 	struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
 	struct chachapoly_req_ctx *rctx = aead_request_ctx(req);
 	struct poly_req *preq = &rctx->u.poly;
 	int err;
 
+	sg_init_table(preq->src, 1);
+	sg_set_buf(preq->src, rctx->key, sizeof(rctx->key));
+
 	ahash_request_set_callback(&preq->req, aead_request_flags(req),
-				   poly_init_done, req);
+				   poly_setkey_done, req);
 	ahash_request_set_tfm(&preq->req, ctx->poly);
+	ahash_request_set_crypt(&preq->req, preq->src, NULL, sizeof(rctx->key));
 
-	err = crypto_ahash_init(&preq->req);
+	err = crypto_ahash_update(&preq->req);
 	if (err)
 		return err;
 
 	return poly_ad(req);
 }
 
-static int poly_genkey_continue(struct aead_request *req)
+static void poly_init_done(struct crypto_async_request *areq, int err)
 {
-	struct crypto_aead *aead = crypto_aead_reqtfm(req);
-	struct chachapoly_ctx *ctx = crypto_aead_ctx(aead);
+	async_done_continue(areq->data, err, poly_setkey);
+}
+
+static int poly_init(struct aead_request *req)
+{
+	struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
 	struct chachapoly_req_ctx *rctx = aead_request_ctx(req);
-	struct chacha_req *creq = &rctx->u.chacha;
+	struct poly_req *preq = &rctx->u.poly;
 	int err;
 
-	crypto_ahash_clear_flags(ctx->poly, CRYPTO_TFM_REQ_MASK);
-	crypto_ahash_set_flags(ctx->poly, crypto_aead_get_flags(aead) &
-			       CRYPTO_TFM_REQ_MASK);
+	ahash_request_set_callback(&preq->req, aead_request_flags(req),
+				   poly_init_done, req);
+	ahash_request_set_tfm(&preq->req, ctx->poly);
 
-	err = crypto_ahash_setkey(ctx->poly, creq->key, sizeof(creq->key));
-	crypto_aead_set_flags(aead, crypto_ahash_get_flags(ctx->poly) &
-			      CRYPTO_TFM_RES_MASK);
+	err = crypto_ahash_init(&preq->req);
 	if (err)
 		return err;
 
-	return poly_init(req);
+	return poly_setkey(req);
 }
 
 static void poly_genkey_done(struct crypto_async_request *areq, int err)
 {
-	async_done_continue(areq->data, err, poly_genkey_continue);
+	async_done_continue(areq->data, err, poly_init);
 }
 
 static int poly_genkey(struct aead_request *req)
@@ -351,8 +357,8 @@ static int poly_genkey(struct aead_request *req)
int err;
 
 	sg_init_table(creq->src, 1);
-	memset(creq->key, 0, sizeof(creq->key));
-	sg_set_buf(creq->src, creq->key, sizeof(creq->key));
+	memset(rctx->key, 0, sizeof(rctx->key));
+	sg_set_buf(creq->src, rctx->key, sizeof(rctx->key));
 
 	chacha_iv(creq->iv, req, 0);
 
@@ -366,7 +372,7 @@ static int poly_genkey(struct aead_request *req)
 	if (err)
 		return err;
 
-	return poly_genkey_continue(req);
+	return poly_init(req);
 }
 
 static void chacha_encrypt_done(struct crypto_async_request *areq, int err)
@@ -403,8 +409,9 @@ static int chachapoly_encrypt(struct aead_request *req)
 
/* encrypt call chain:
 * - chacha_encrypt/done()
-* - poly_genkey/done/continue()
+* - poly_genkey/done

Re: [PATCH 3/9] crypto: Add a generic Poly1305 authenticator implementation

2015-06-04 Thread Martin Willi
Herbert,

> I just realised that this doesn't quite work.  The key is shared
> by all users of the tfm, yet in your case you need it to be local

I agree, as Poly1305 uses a different key for each tag the current
approach doesn't work.

> I think the simplest solution is to make the key the beginning
> of the hashed text instead.  So the first two blocks that you
> process get used as the key.

Yes, that makes sense. I'll prepare a fix, might require some days,
though.

Thanks!
Martin



[PATCH 7/9] crypto: chacha20poly1305 - Add an IPsec variant for RFC7539 AEAD

2015-06-01 Thread Martin Willi
draft-ietf-ipsecme-chacha20-poly1305 defines the use of ChaCha20/Poly1305 in
ESP. It uses four bytes of additional key material as a salt, which is then
combined with an 8-byte IV to form the ChaCha20 nonce as defined in RFC7539.
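
The resulting 16-byte ChaCha20 IV layout is: 4 bytes little-endian initial
block counter, the 4-byte salt taken from the key material, then the 8-byte
explicit IV from the packet. A standalone sketch (hypothetical helper, not
the kernel code):

	#include <stdint.h>
	#include <string.h>

	static void chacha_esp_iv(uint8_t iv[16], uint32_t icb,
				  const uint8_t salt[4], const uint8_t seq[8])
	{
		uint8_t ctr[4] = { (uint8_t)icb, (uint8_t)(icb >> 8),
				   (uint8_t)(icb >> 16), (uint8_t)(icb >> 24) };

		memcpy(iv,     ctr,  4);	/* initial block counter, LE */
		memcpy(iv + 4, salt, 4);	/* salt from key material */
		memcpy(iv + 8, seq,  8);	/* explicit per-packet IV */
	}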

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/chacha20poly1305.c | 26 +-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index 6171cf1..05fbc59 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -627,6 +627,11 @@ static struct crypto_instance *rfc7539_alloc(struct rtattr **tb)
 	return chachapoly_alloc(tb, "rfc7539", 12);
 }
 
+static struct crypto_instance *rfc7539esp_alloc(struct rtattr **tb)
+{
+	return chachapoly_alloc(tb, "rfc7539esp", 8);
+}
+
+
 static void chachapoly_free(struct crypto_instance *inst)
 {
struct chachapoly_instance_ctx *ctx = crypto_instance_ctx(inst);
@@ -643,13 +648,31 @@ static struct crypto_template rfc7539_tmpl = {
.module = THIS_MODULE,
 };
 
+static struct crypto_template rfc7539esp_tmpl = {
+	.name = "rfc7539esp",
+   .alloc = rfc7539esp_alloc,
+   .free = chachapoly_free,
+   .module = THIS_MODULE,
+};
+
 static int __init chacha20poly1305_module_init(void)
 {
-	return crypto_register_template(&rfc7539_tmpl);
+	int err;
+
+	err = crypto_register_template(&rfc7539_tmpl);
+	if (err)
+		return err;
+
+	err = crypto_register_template(&rfc7539esp_tmpl);
+	if (err)
+		crypto_unregister_template(&rfc7539_tmpl);
+
+	return err;
 }
 
 static void __exit chacha20poly1305_module_exit(void)
 {
+	crypto_unregister_template(&rfc7539esp_tmpl);
 	crypto_unregister_template(&rfc7539_tmpl);
 }
 }
 
@@ -661,3 +684,4 @@ MODULE_AUTHOR("Martin Willi <mar...@strongswan.org>");
 MODULE_DESCRIPTION("ChaCha20-Poly1305 AEAD");
 MODULE_ALIAS_CRYPTO("chacha20poly1305");
 MODULE_ALIAS_CRYPTO("rfc7539");
+MODULE_ALIAS_CRYPTO("rfc7539esp");
-- 
1.9.1



[PATCH 6/9] crypto: testmgr - Add ChaCha20-Poly1305 test vectors from RFC7539

2015-06-01 Thread Martin Willi
Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/testmgr.c |  15 
 crypto/testmgr.h | 269 +++
 2 files changed, 284 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index faf93a6..915a9ef 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3387,6 +3387,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+	.alg = "rfc7539(chacha20,poly1305)",
+   .test = alg_test_aead,
+   .suite = {
+   .aead = {
+   .enc = {
+   .vecs = rfc7539_enc_tv_template,
+   .count = RFC7539_ENC_TEST_VECTORS
+   },
+   .dec = {
+   .vecs = rfc7539_dec_tv_template,
+   .count = RFC7539_DEC_TEST_VECTORS
+   },
+   }
+   }
+   }, {
 	.alg = "rmd128",
.test = alg_test_hash,
.suite = {
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 0cbb718..be0caf5 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -21020,6 +21020,275 @@ static struct aead_testvec aes_ccm_rfc4309_dec_tv_template[] = {
 };
 
 /*
+ * ChaCha20-Poly1305 AEAD test vectors from RFC7539 2.8.2./A.5.
+ */
+#define RFC7539_ENC_TEST_VECTORS 2
+#define RFC7539_DEC_TEST_VECTORS 2
+static struct aead_testvec rfc7539_enc_tv_template[] = {
+   {
+   .key= \x80\x81\x82\x83\x84\x85\x86\x87
+ \x88\x89\x8a\x8b\x8c\x8d\x8e\x8f
+ \x90\x91\x92\x93\x94\x95\x96\x97
+ \x98\x99\x9a\x9b\x9c\x9d\x9e\x9f,
+   .klen   = 32,
+   .iv = \x07\x00\x00\x00\x40\x41\x42\x43
+ \x44\x45\x46\x47,
+   .assoc  = \x50\x51\x52\x53\xc0\xc1\xc2\xc3
+ \xc4\xc5\xc6\xc7,
+   .alen   = 12,
+   .input  = \x4c\x61\x64\x69\x65\x73\x20\x61
+ \x6e\x64\x20\x47\x65\x6e\x74\x6c
+ \x65\x6d\x65\x6e\x20\x6f\x66\x20
+ \x74\x68\x65\x20\x63\x6c\x61\x73
+ \x73\x20\x6f\x66\x20\x27\x39\x39
+ \x3a\x20\x49\x66\x20\x49\x20\x63
+ \x6f\x75\x6c\x64\x20\x6f\x66\x66
+ \x65\x72\x20\x79\x6f\x75\x20\x6f
+ \x6e\x6c\x79\x20\x6f\x6e\x65\x20
+ \x74\x69\x70\x20\x66\x6f\x72\x20
+ \x74\x68\x65\x20\x66\x75\x74\x75
+ \x72\x65\x2c\x20\x73\x75\x6e\x73
+ \x63\x72\x65\x65\x6e\x20\x77\x6f
+ \x75\x6c\x64\x20\x62\x65\x20\x69
+ \x74\x2e,
+   .ilen   = 114,
+   .result = \xd3\x1a\x8d\x34\x64\x8e\x60\xdb
+ \x7b\x86\xaf\xbc\x53\xef\x7e\xc2
+ \xa4\xad\xed\x51\x29\x6e\x08\xfe
+ \xa9\xe2\xb5\xa7\x36\xee\x62\xd6
+ \x3d\xbe\xa4\x5e\x8c\xa9\x67\x12
+ \x82\xfa\xfb\x69\xda\x92\x72\x8b
+ \x1a\x71\xde\x0a\x9e\x06\x0b\x29
+ \x05\xd6\xa5\xb6\x7e\xcd\x3b\x36
+ \x92\xdd\xbd\x7f\x2d\x77\x8b\x8c
+ \x98\x03\xae\xe3\x28\x09\x1b\x58
+ \xfa\xb3\x24\xe4\xfa\xd6\x75\x94
+ \x55\x85\x80\x8b\x48\x31\xd7\xbc
+ \x3f\xf4\xde\xf0\x8e\x4b\x7a\x9d
+ \xe5\x76\xd2\x65\x86\xce\xc6\x4b
+ \x61\x16\x1a\xe1\x0b\x59\x4f\x09
+ \xe2\x6a\x7e\x90\x2e\xcb\xd0\x60
+ \x06\x91,
+   .rlen   = 130,
+   }, {
+   .key= \x1c\x92\x40\xa5\xeb\x55\xd3\x8a
+ \xf3\x33\x88\x86\x04\xf6\xb5\xf0
+ \x47\x39\x17\xc1\x40\x2b\x80\x09
+ \x9d\xca\x5c\xbc\x20\x70\x75\xc0,
+   .klen   = 32,
+   .iv = \x00\x00\x00\x00\x01\x02\x03\x04
+ \x05\x06\x07\x08,
+   .assoc  = \xf3\x33\x88\x86\x00\x00\x00\x00
+ \x00\x00\x4e\x91,
+   .alen   = 12,
+   .input  = \x49\x6e\x74\x65\x72\x6e\x65\x74
+ \x2d\x44\x72\x61\x66\x74\x73\x20
+ \x61\x72\x65\x20\x64\x72\x61\x66
+ \x74\x20\x64\x6f\x63\x75\x6d\x65
+ \x6e\x74\x73\x20\x76\x61\x6c\x69
+ \x64\x20\x66\x6f\x72\x20\x61\x20
+ \x6d\x61\x78\x69\x6d\x75\x6d\x20

[PATCH 5/9] crypto: Add a ChaCha20-Poly1305 AEAD construction, RFC7539

2015-06-01 Thread Martin Willi
This AEAD uses a chacha20 ablkcipher and a poly1305 ahash to construct the
ChaCha20-Poly1305 AEAD as defined in RFC7539. It supports both synchronous and
asynchronous operations, even if we currently have no async chacha20 or poly1305
drivers.
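
The Poly1305 input this construction authenticates is laid out as RFC7539
specifies, which is what the poly_req pad/tail fields below implement: the
AD zero-padded to a 16-byte boundary, the ciphertext zero-padded likewise,
then both lengths as 64-bit little-endian words. A standalone sketch of
that layout (hypothetical helper, assumes a little-endian host and a large
enough output buffer):

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	static size_t poly_input(uint8_t *out, const uint8_t *ad, size_t alen,
				 const uint8_t *ct, size_t clen)
	{
		size_t off = 0;
		uint64_t le;

		memcpy(out + off, ad, alen); off += alen;
		memset(out + off, 0, -alen & 15); off += -alen & 15; /* pad */
		memcpy(out + off, ct, clen); off += clen;
		memset(out + off, 0, -clen & 15); off += -clen & 15; /* pad */
		le = alen; memcpy(out + off, &le, 8); off += 8;  /* LE lengths */
		le = clen; memcpy(out + off, &le, 8); off += 8;
		return off;
	}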

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/Kconfig|  12 +
 crypto/Makefile   |   1 +
 crypto/chacha20poly1305.c | 663 ++
 3 files changed, 676 insertions(+)
 create mode 100644 crypto/chacha20poly1305.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 5fef005..ce53ab1a 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -217,6 +217,18 @@ config CRYPTO_GCM
  Support for Galois/Counter Mode (GCM) and Galois Message
  Authentication Code (GMAC). Required for IPSec.
 
+config CRYPTO_CHACHA20POLY1305
+	tristate "ChaCha20-Poly1305 AEAD support"
+   select CRYPTO_CHACHA20
+   select CRYPTO_POLY1305
+   select CRYPTO_AEAD
+   help
+ ChaCha20-Poly1305 AEAD support, RFC7539.
+
+ Support for the AEAD wrapper using the ChaCha20 stream cipher combined
+ with the Poly1305 authenticator. It is defined in RFC7539 for use in
+ IETF protocols.
+
 config CRYPTO_SEQIV
 	tristate "Sequence Number IV Generator"
select CRYPTO_AEAD
diff --git a/crypto/Makefile b/crypto/Makefile
index 2424c81..e6cf6a5 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_CRYPTO_XTS) += xts.o
 obj-$(CONFIG_CRYPTO_CTR) += ctr.o
 obj-$(CONFIG_CRYPTO_GCM) += gcm.o
 obj-$(CONFIG_CRYPTO_CCM) += ccm.o
+obj-$(CONFIG_CRYPTO_CHACHA20POLY1305) += chacha20poly1305.o
 obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o
 obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o
 obj-$(CONFIG_CRYPTO_MCRYPTD) += mcryptd.o
diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
new file mode 100644
index 000..6171cf1
--- /dev/null
+++ b/crypto/chacha20poly1305.c
@@ -0,0 +1,663 @@
+/*
+ * ChaCha20-Poly1305 AEAD, RFC7539
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include "internal.h"
+
+#define POLY1305_BLOCK_SIZE	16
+#define POLY1305_DIGEST_SIZE	16
+#define POLY1305_KEY_SIZE	32
+#define CHACHA20_KEY_SIZE	32
+#define CHACHA20_IV_SIZE	16
+#define CHACHAPOLY_IV_SIZE	12
+
+struct chachapoly_instance_ctx {
+   struct crypto_skcipher_spawn chacha;
+   struct crypto_ahash_spawn poly;
+   unsigned int saltlen;
+};
+
+struct chachapoly_ctx {
+   struct crypto_ablkcipher *chacha;
+   struct crypto_ahash *poly;
+   /* key bytes we use for the ChaCha20 IV */
+   unsigned int saltlen;
+   u8 salt[];
+};
+
+struct poly_req {
+   /* zero byte padding for AD/ciphertext, as needed */
+   u8 pad[POLY1305_BLOCK_SIZE];
+   /* tail data with AD/ciphertext lengths */
+   struct {
+   __le64 assoclen;
+   __le64 cryptlen;
+   } tail;
+   struct scatterlist src[1];
+   struct ahash_request req; /* must be last member */
+};
+
+struct chacha_req {
+   /* the key we generate for Poly1305 using Chacha20 */
+   u8 key[POLY1305_KEY_SIZE];
+   u8 iv[CHACHA20_IV_SIZE];
+   struct scatterlist src[1];
+   struct ablkcipher_request req; /* must be last member */
+};
+
+struct chachapoly_req_ctx {
+   /* calculated Poly1305 tag */
+   u8 tag[POLY1305_DIGEST_SIZE];
+   /* length of data to en/decrypt, without ICV */
+   unsigned int cryptlen;
+   union {
+   struct poly_req poly;
+   struct chacha_req chacha;
+   } u;
+};
+
+static inline void async_done_continue(struct aead_request *req, int err,
+  int (*cont)(struct aead_request *))
+{
+   if (!err)
+   err = cont(req);
+
+	if (err != -EINPROGRESS && err != -EBUSY)
+   aead_request_complete(req, err);
+}
+
+static void chacha_iv(u8 *iv, struct aead_request *req, u32 icb)
+{
+   struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
+   __le32 leicb = cpu_to_le32(icb);
+
+	memcpy(iv, &leicb, sizeof(leicb));
+	memcpy(iv + sizeof(leicb), ctx->salt, ctx->saltlen);
+	memcpy(iv + sizeof(leicb) + ctx->saltlen, req->iv,
+	       CHACHA20_IV_SIZE - sizeof(leicb) - ctx->saltlen);
+}
+
+static int poly_verify_tag(struct aead_request *req)
+{
+   struct chachapoly_req_ctx *rctx = aead_request_ctx(req);
+	u8 tag[sizeof(rctx->tag)];
+
+   scatterwalk_map_and_copy

[PATCH 2/9] crypto: testmgr - Add ChaCha20 test vectors from RFC7539

2015-06-01 Thread Martin Willi
We explicitly set the initial block counter by prepending it to the nonce in
little-endian order. The same test vector is used for both encryption and
decryption, as ChaCha20 is a stream cipher that XORs a keystream into the data.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/testmgr.c |  15 +
 crypto/testmgr.h | 177 +++
 2 files changed, 192 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 717d6f2..abd09c2 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -2308,6 +2308,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+	.alg = "chacha20",
+   .test = alg_test_skcipher,
+   .suite = {
+   .cipher = {
+   .enc = {
+   .vecs = chacha20_enc_tv_template,
+   .count = CHACHA20_ENC_TEST_VECTORS
+   },
+   .dec = {
+   .vecs = chacha20_enc_tv_template,
+   .count = CHACHA20_ENC_TEST_VECTORS
+   },
+   }
+   }
+   }, {
 	.alg = "cmac(aes)",
.test = alg_test_hash,
.suite = {
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 6003143..6e7b902 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -28427,6 +28427,183 @@ static struct cipher_testvec salsa20_stream_enc_tv_template[] = {
},
 };
 
+#define CHACHA20_ENC_TEST_VECTORS 3
+static struct cipher_testvec chacha20_enc_tv_template[] = {
+   { /* RFC7539 A.2. Test Vector #1 */
+   .key= \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00,
+   .klen   = 32,
+   .iv = \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00,
+   .input  = \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00,
+   .ilen   = 64,
+   .result = \x76\xb8\xe0\xad\xa0\xf1\x3d\x90
+ \x40\x5d\x6a\xe5\x53\x86\xbd\x28
+ \xbd\xd2\x19\xb8\xa0\x8d\xed\x1a
+ \xa8\x36\xef\xcc\x8b\x77\x0d\xc7
+ \xda\x41\x59\x7c\x51\x57\x48\x8d
+ \x77\x24\xe0\x3f\xb8\xd8\x4a\x37
+ \x6a\x43\xb8\xf4\x15\x18\xa1\x1c
+ \xc3\x87\xb6\x69\xb2\xee\x65\x86,
+   .rlen   = 64,
+   }, { /* RFC7539 A.2. Test Vector #2 */
+   .key= \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x01,
+   .klen   = 32,
+   .iv = \x01\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x02,
+   .input  = \x41\x6e\x79\x20\x73\x75\x62\x6d
+ \x69\x73\x73\x69\x6f\x6e\x20\x74
+ \x6f\x20\x74\x68\x65\x20\x49\x45
+ \x54\x46\x20\x69\x6e\x74\x65\x6e
+ \x64\x65\x64\x20\x62\x79\x20\x74
+ \x68\x65\x20\x43\x6f\x6e\x74\x72
+ \x69\x62\x75\x74\x6f\x72\x20\x66
+ \x6f\x72\x20\x70\x75\x62\x6c\x69
+ \x63\x61\x74\x69\x6f\x6e\x20\x61
+ \x73\x20\x61\x6c\x6c\x20\x6f\x72
+ \x20\x70\x61\x72\x74\x20\x6f\x66
+ \x20\x61\x6e\x20\x49\x45\x54\x46
+ \x20\x49\x6e\x74\x65\x72\x6e\x65
+ \x74\x2d\x44\x72\x61\x66\x74\x20
+ \x6f\x72\x20\x52\x46\x43\x20\x61
+ \x6e\x64\x20\x61\x6e\x79\x20\x73
+ \x74\x61\x74\x65\x6d\x65\x6e\x74
+ \x20\x6d\x61\x64\x65\x20\x77\x69
+ \x74\x68\x69\x6e\x20\x74\x68\x65
+ \x20\x63\x6f\x6e\x74\x65\x78\x74
+ \x20\x6f\x66\x20\x61\x6e\x20\x49
+ \x45\x54\x46\x20\x61\x63\x74\x69
+ \x76\x69\x74\x79\x20\x69\x73\x20
+ \x63

[PATCH 1/9] crypto: Add a generic ChaCha20 stream cipher implementation

2015-06-01 Thread Martin Willi
ChaCha20 is a high-speed stream cipher with a 256-bit key, designed by
Daniel J. Bernstein. It is further specified in RFC7539 for use in IETF
protocols as a building block for the ChaCha20-Poly1305 AEAD.

This is a portable C implementation without any architecture specific
optimizations. It uses a 16-byte IV, consisting of the 12-byte ChaCha20 nonce
prefixed with the 4-byte initial block counter. Some algorithms, such as the
mentioned AEAD construction, require an explicit counter value.
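
For reference, the 16-word ChaCha20 state is laid out as RFC7539 defines,
with the driver's 16-byte IV supplying words 12..15 (the 32-bit counter
first, then the 96-bit nonce). A standalone sketch of the state setup
(little-endian host assumed, not the kernel code itself):

	#include <stdint.h>
	#include <string.h>

	static void chacha20_init_state(uint32_t state[16],
					const uint8_t key[32],
					const uint8_t iv[16])
	{
		static const uint32_t c[4] = {	/* "expand 32-byte k" */
			0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
		};

		memcpy(state,      c,   16);	/* words 0..3: constants */
		memcpy(state + 4,  key, 32);	/* words 4..11: key */
		memcpy(state + 12, iv,  16);	/* word 12: counter, 13..15: nonce */
	}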

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/Kconfig|  13 +++
 crypto/Makefile   |   1 +
 crypto/chacha20_generic.c | 216 ++
 3 files changed, 230 insertions(+)
 create mode 100644 crypto/chacha20_generic.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 0ff4cd4..5407e8f 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1156,6 +1156,19 @@ config CRYPTO_SALSA20_X86_64
  The Salsa20 stream cipher algorithm is designed by Daniel J.
  Bernstein d...@cr.yp.to. See http://cr.yp.to/snuffle.html
 
+config CRYPTO_CHACHA20
+	tristate "ChaCha20 cipher algorithm"
+   select CRYPTO_BLKCIPHER
+   help
+ ChaCha20 cipher algorithm, RFC7539.
+
+ ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
+ Bernstein and further specified in RFC7539 for use in IETF protocols.
+ This is the portable C implementation of ChaCha20.
+
+ See also:
+ http://cr.yp.to/chacha/chacha-20080128.pdf
+
 config CRYPTO_SEED
tristate SEED cipher algorithm
select CRYPTO_ALGAPI
diff --git a/crypto/Makefile b/crypto/Makefile
index 5db5b95..be87ec1 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -80,6 +80,7 @@ obj-$(CONFIG_CRYPTO_KHAZAD) += khazad.o
 obj-$(CONFIG_CRYPTO_ANUBIS) += anubis.o
 obj-$(CONFIG_CRYPTO_SEED) += seed.o
 obj-$(CONFIG_CRYPTO_SALSA20) += salsa20_generic.o
+obj-$(CONFIG_CRYPTO_CHACHA20) += chacha20_generic.o
 obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o
 obj-$(CONFIG_CRYPTO_ZLIB) += zlib.o
 obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
new file mode 100644
index 000..fa42e70
--- /dev/null
+++ b/crypto/chacha20_generic.c
@@ -0,0 +1,216 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#define CHACHA20_NONCE_SIZE 16
+#define CHACHA20_KEY_SIZE   32
+#define CHACHA20_BLOCK_SIZE 64
+
+struct chacha20_ctx {
+   u32 key[8];
+};
+
+static inline u32 rotl32(u32 v, u8 n)
+{
+	return (v << n) | (v >> (sizeof(v) * 8 - n));
+}
+
+static inline u32 le32_to_cpuvp(const void *p)
+{
+   return le32_to_cpup(p);
+}
+
+static void chacha20_block(u32 *state, void *stream)
+{
+   u32 x[16], *out = stream;
+   int i;
+
+	for (i = 0; i < ARRAY_SIZE(x); i++)
+   x[i] = state[i];
+
+	for (i = 0; i < 20; i += 2) {
+   x[0]  += x[4];x[12] = rotl32(x[12] ^ x[0],  16);
+   x[1]  += x[5];x[13] = rotl32(x[13] ^ x[1],  16);
+   x[2]  += x[6];x[14] = rotl32(x[14] ^ x[2],  16);
+   x[3]  += x[7];x[15] = rotl32(x[15] ^ x[3],  16);
+
+   x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],  12);
+   x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],  12);
+   x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10], 12);
+   x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11], 12);
+
+   x[0]  += x[4];x[12] = rotl32(x[12] ^ x[0],   8);
+   x[1]  += x[5];x[13] = rotl32(x[13] ^ x[1],   8);
+   x[2]  += x[6];x[14] = rotl32(x[14] ^ x[2],   8);
+   x[3]  += x[7];x[15] = rotl32(x[15] ^ x[3],   8);
+
+   x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],   7);
+   x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],   7);
+   x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10],  7);
+   x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11],  7);
+
+   x[0]  += x[5];x[15] = rotl32(x[15] ^ x[0],  16);
+   x[1]  += x[6];x[12] = rotl32(x[12] ^ x[1],  16);
+   x[2]  += x[7];x[13] = rotl32(x[13] ^ x[2],  16);
+   x[3]  += x[4];x[14] = rotl32(x[14] ^ x[3],  16);
+
+   x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10], 12);
+   x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11], 12);
+   x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],  12);
+   x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],  12

[PATCH 9/9] xfrm: Define ChaCha20-Poly1305 AEAD XFRM algo for IPsec users

2015-06-01 Thread Martin Willi
Signed-off-by: Martin Willi mar...@strongswan.org
---
 net/xfrm/xfrm_algo.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 67266b7..42f7c76 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -159,6 +159,18 @@ static struct xfrm_algo_desc aead_list[] = {
.sadb_alg_maxbits = 256
}
 },
+{
+	.name = "rfc7539esp(chacha20,poly1305)",
+
+   .uinfo = {
+   .aead = {
+   .geniv = seqniv,
 			.icv_truncbits = 128,
+   }
+   },
+
+   .pfkey_supported = 0,
+},
 };
 
 static struct xfrm_algo_desc aalg_list[] = {
-- 
1.9.1



[PATCH 4/9] crypto: testmgr - Add Poly1305 test vectors from RFC7539

2015-06-01 Thread Martin Willi
Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/testmgr.c |   9 ++
 crypto/testmgr.h | 259 +++
 2 files changed, 268 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index abd09c2..faf93a6 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3315,6 +3315,15 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+	.alg = "poly1305",
+   .test = alg_test_hash,
+   .suite = {
+   .hash = {
+   .vecs = poly1305_tv_template,
+   .count = POLY1305_TEST_VECTORS
+   }
+   }
+   }, {
 	.alg = "rfc3686(ctr(aes))",
.test = alg_test_skcipher,
.fips_allowed = 1,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 6e7b902..0cbb718 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -3026,6 +3026,265 @@ static struct hash_testvec hmac_sha512_tv_template[] = {
 };
 
 /*
+ * Poly1305 test vectors from RFC7539 A.3.
+ */
+
+#define POLY1305_TEST_VECTORS  11
+
+static struct hash_testvec poly1305_tv_template[] = {
+   { /* Test Vector #1 */
+   .key= \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00,
+   .ksize  = 32,
+   .plaintext  = \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00,
+   .psize  = 64,
+   .digest = \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00,
+   }, { /* Test Vector #2 */
+   .key= \x00\x00\x00\x00\x00\x00\x00\x00
+ \x00\x00\x00\x00\x00\x00\x00\x00
+ \x36\xe5\xf6\xb5\xc5\xe0\x60\x70
+ \xf0\xef\xca\x96\x22\x7a\x86\x3e,
+   .ksize  = 32,
+   .plaintext  = \x41\x6e\x79\x20\x73\x75\x62\x6d
+ \x69\x73\x73\x69\x6f\x6e\x20\x74
+ \x6f\x20\x74\x68\x65\x20\x49\x45
+ \x54\x46\x20\x69\x6e\x74\x65\x6e
+ \x64\x65\x64\x20\x62\x79\x20\x74
+ \x68\x65\x20\x43\x6f\x6e\x74\x72
+ \x69\x62\x75\x74\x6f\x72\x20\x66
+ \x6f\x72\x20\x70\x75\x62\x6c\x69
+ \x63\x61\x74\x69\x6f\x6e\x20\x61
+ \x73\x20\x61\x6c\x6c\x20\x6f\x72
+ \x20\x70\x61\x72\x74\x20\x6f\x66
+ \x20\x61\x6e\x20\x49\x45\x54\x46
+ \x20\x49\x6e\x74\x65\x72\x6e\x65
+ \x74\x2d\x44\x72\x61\x66\x74\x20
+ \x6f\x72\x20\x52\x46\x43\x20\x61
+ \x6e\x64\x20\x61\x6e\x79\x20\x73
+ \x74\x61\x74\x65\x6d\x65\x6e\x74
+ \x20\x6d\x61\x64\x65\x20\x77\x69
+ \x74\x68\x69\x6e\x20\x74\x68\x65
+ \x20\x63\x6f\x6e\x74\x65\x78\x74
+ \x20\x6f\x66\x20\x61\x6e\x20\x49
+ \x45\x54\x46\x20\x61\x63\x74\x69
+ \x76\x69\x74\x79\x20\x69\x73\x20
+ \x63\x6f\x6e\x73\x69\x64\x65\x72
+ \x65\x64\x20\x61\x6e\x20\x22\x49
+ \x45\x54\x46\x20\x43\x6f\x6e\x74
+ \x72\x69\x62\x75\x74\x69\x6f\x6e
+ \x22\x2e\x20\x53\x75\x63\x68\x20
+ \x73\x74\x61\x74\x65\x6d\x65\x6e
+ \x74\x73\x20\x69\x6e\x63\x6c\x75
+ \x64\x65\x20\x6f\x72\x61\x6c\x20
+ \x73\x74\x61\x74\x65\x6d\x65\x6e
+ \x74\x73\x20\x69\x6e\x20\x49\x45
+ \x54\x46\x20\x73\x65\x73\x73\x69
+ \x6f\x6e\x73\x2c\x20\x61\x73\x20

[PATCH 8/9] crypto: testmgr - Add draft-ietf-ipsecme-chacha20-poly1305 test vector

2015-06-01 Thread Martin Willi
Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/testmgr.c |  15 +
 crypto/testmgr.h | 179 +++
 2 files changed, 194 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 915a9ef..ccd19cf 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3402,6 +3402,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+	.alg = "rfc7539esp(chacha20,poly1305)",
+   .test = alg_test_aead,
+   .suite = {
+   .aead = {
+   .enc = {
+   .vecs = rfc7539esp_enc_tv_template,
+   .count = RFC7539ESP_ENC_TEST_VECTORS
+   },
+   .dec = {
+   .vecs = rfc7539esp_dec_tv_template,
+   .count = RFC7539ESP_DEC_TEST_VECTORS
+   },
+   }
+   }
+   }, {
 	.alg = "rmd128",
.test = alg_test_hash,
.suite = {
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index be0caf5..4d98e40 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -21289,6 +21289,185 @@ static struct aead_testvec rfc7539_dec_tv_template[] = {
 };
 
 /*
+ * draft-ietf-ipsecme-chacha20-poly1305
+ */
+#define RFC7539ESP_DEC_TEST_VECTORS 1
+#define RFC7539ESP_ENC_TEST_VECTORS 1
+static struct aead_testvec rfc7539esp_enc_tv_template[] = {
+   {
+   .key= \x1c\x92\x40\xa5\xeb\x55\xd3\x8a
+ \xf3\x33\x88\x86\x04\xf6\xb5\xf0
+ \x47\x39\x17\xc1\x40\x2b\x80\x09
+ \x9d\xca\x5c\xbc\x20\x70\x75\xc0
+ \x00\x00\x00\x00,
+   .klen   = 36,
+   .iv = \x01\x02\x03\x04\x05\x06\x07\x08,
+   .assoc  = \xf3\x33\x88\x86\x00\x00\x00\x00
+ \x00\x00\x4e\x91,
+   .alen   = 12,
+   .input  = \x49\x6e\x74\x65\x72\x6e\x65\x74
+ \x2d\x44\x72\x61\x66\x74\x73\x20
+ \x61\x72\x65\x20\x64\x72\x61\x66
+ \x74\x20\x64\x6f\x63\x75\x6d\x65
+ \x6e\x74\x73\x20\x76\x61\x6c\x69
+ \x64\x20\x66\x6f\x72\x20\x61\x20
+ \x6d\x61\x78\x69\x6d\x75\x6d\x20
+ \x6f\x66\x20\x73\x69\x78\x20\x6d
+ \x6f\x6e\x74\x68\x73\x20\x61\x6e
+ \x64\x20\x6d\x61\x79\x20\x62\x65
+ \x20\x75\x70\x64\x61\x74\x65\x64
+ \x2c\x20\x72\x65\x70\x6c\x61\x63
+ \x65\x64\x2c\x20\x6f\x72\x20\x6f
+ \x62\x73\x6f\x6c\x65\x74\x65\x64
+ \x20\x62\x79\x20\x6f\x74\x68\x65
+ \x72\x20\x64\x6f\x63\x75\x6d\x65
+ \x6e\x74\x73\x20\x61\x74\x20\x61
+ \x6e\x79\x20\x74\x69\x6d\x65\x2e
+ \x20\x49\x74\x20\x69\x73\x20\x69
+ \x6e\x61\x70\x70\x72\x6f\x70\x72
+ \x69\x61\x74\x65\x20\x74\x6f\x20
+ \x75\x73\x65\x20\x49\x6e\x74\x65
+ \x72\x6e\x65\x74\x2d\x44\x72\x61
+ \x66\x74\x73\x20\x61\x73\x20\x72
+ \x65\x66\x65\x72\x65\x6e\x63\x65
+ \x20\x6d\x61\x74\x65\x72\x69\x61
+ \x6c\x20\x6f\x72\x20\x74\x6f\x20
+ \x63\x69\x74\x65\x20\x74\x68\x65
+ \x6d\x20\x6f\x74\x68\x65\x72\x20
+ \x74\x68\x61\x6e\x20\x61\x73\x20
+ \x2f\xe2\x80\x9c\x77\x6f\x72\x6b
+ \x20\x69\x6e\x20\x70\x72\x6f\x67
+ \x72\x65\x73\x73\x2e\x2f\xe2\x80
+ \x9d,
+   .ilen   = 265,
+   .result = \x64\xa0\x86\x15\x75\x86\x1a\xf4
+ \x60\xf0\x62\xc7\x9b\xe6\x43\xbd
+ \x5e\x80\x5c\xfd\x34\x5c\xf3\x89
+ \xf1\x08\x67\x0a\xc7\x6c\x8c\xb2
+ \x4c\x6c\xfc\x18\x75\x5d\x43\xee
+ \xa0\x9e\xe9\x4e\x38\x2d\x26\xb0
+ \xbd\xb7\xb7\x3c\x32\x1b\x01\x00
+ \xd4\xf0\x3b\x7f\x35\x58\x94\xcf
+ \x33\x2f\x83\x0e\x71\x0b\x97\xce
+ \x98\xc8\xa8\x4a\xbd\x0b\x94\x81
+ \x14\xad\x17\x6e\x00\x8d\x33\xbd
+ \x60\xf9\x82\xb1\xff\x37\xc8\x55
+ \x97\x97\xa0\x6e\xf4\xf0\xef\x61
+ \xc1\x86\x32\x4e\x2b\x35\x06\x38

[PATCH 3/9] crypto: Add a generic Poly1305 authenticator implementation

2015-06-01 Thread Martin Willi
Poly1305 is a fast message authenticator designed by Daniel J. Bernstein.
It is further defined in RFC7539 as a building block for the ChaCha20-Poly1305
AEAD for use in IETF protocols.

This is a portable C implementation of the algorithm without architecture
specific optimizations, based on public domain code by Daniel J. Bernstein and
Andrew Moon.
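
Part of what RFC7539 specifies is the clamping of the r half of the key,
forcing certain bits to zero so the multiplication stays within bounds; the
setkey hunk below performs the same masking directly in 26-bit limbs. A
standalone sketch of the byte-level clamp (hypothetical helper):

	#include <stdint.h>

	/* clamp the first 16 key bytes (r) as RFC7539 requires */
	static void poly1305_clamp(uint8_t r[16])
	{
		r[3]  &= 15;  r[7]  &= 15;  r[11] &= 15;  r[15] &= 15;
		r[4]  &= 252; r[8]  &= 252; r[12] &= 252;
	}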

Signed-off-by: Martin Willi mar...@strongswan.org
---
 crypto/Kconfig|   9 ++
 crypto/Makefile   |   1 +
 crypto/poly1305_generic.c | 300 ++
 3 files changed, 310 insertions(+)
 create mode 100644 crypto/poly1305_generic.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 5407e8f..5fef005 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -426,6 +426,15 @@ config CRYPTO_GHASH
help
  GHASH is message digest algorithm for GCM (Galois/Counter Mode).
 
+config CRYPTO_POLY1305
+	tristate "Poly1305 authenticator algorithm"
+	help
+	  Poly1305 authenticator algorithm, RFC7539.
+
+	  Poly1305 is an authenticator algorithm designed by Daniel J. Bernstein.
+	  It is used for the ChaCha20-Poly1305 AEAD, specified in RFC7539 for use
+	  in IETF protocols. This is the portable C implementation of Poly1305.
+
 config CRYPTO_MD4
tristate MD4 digest algorithm
select CRYPTO_HASH
diff --git a/crypto/Makefile b/crypto/Makefile
index be87ec1..2424c81 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_CRYPTO_ANUBIS) += anubis.o
 obj-$(CONFIG_CRYPTO_SEED) += seed.o
 obj-$(CONFIG_CRYPTO_SALSA20) += salsa20_generic.o
 obj-$(CONFIG_CRYPTO_CHACHA20) += chacha20_generic.o
+obj-$(CONFIG_CRYPTO_POLY1305) += poly1305_generic.o
 obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o
 obj-$(CONFIG_CRYPTO_ZLIB) += zlib.o
 obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
new file mode 100644
index 000..9c1159b
--- /dev/null
+++ b/crypto/poly1305_generic.c
@@ -0,0 +1,300 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * Based on public domain code by Andrew Moon and Daniel J. Bernstein.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#define POLY1305_BLOCK_SIZE	16
+#define POLY1305_KEY_SIZE	32
+#define POLY1305_DIGEST_SIZE	16
+
+struct poly1305_ctx {
+   /* key */
+   u32 r[5];
+   /* finalize key */
+   u32 s[4];
+};
+
+struct poly1305_desc_ctx {
+   /* accumulator */
+   u32 h[5];
+   /* partial buffer */
+   u8 buf[POLY1305_BLOCK_SIZE];
+   /* bytes used in partial buffer */
+   unsigned int buflen;
+};
+
+static inline u64 mlt(u64 a, u64 b)
+{
+   return a * b;
+}
+
+static inline u32 sr(u64 v, u_char n)
+{
+	return v >> n;
+}
+
+static inline u32 and(u32 v, u32 mask)
+{
+	return v & mask;
+}
+
+static inline u32 le32_to_cpuvp(const void *p)
+{
+   return le32_to_cpup(p);
+}
+
+static int poly1305_init(struct shash_desc *desc)
+{
+   struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	memset(dctx->h, 0, sizeof(dctx->h));
+	dctx->buflen = 0;
+
+   return 0;
+}
+
+static int poly1305_setkey(struct crypto_shash *tfm,
+  const u8 *key, unsigned int keylen)
+{
+   struct poly1305_ctx *ctx = crypto_shash_ctx(tfm);
+
+   if (keylen != POLY1305_KEY_SIZE) {
+   crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+   return -EINVAL;
+   }
+
+	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+	ctx->r[0] = (le32_to_cpuvp(key +  0) >> 0) & 0x3ffffff;
+	ctx->r[1] = (le32_to_cpuvp(key +  3) >> 2) & 0x3ffff03;
+	ctx->r[2] = (le32_to_cpuvp(key +  6) >> 4) & 0x3ffc0ff;
+	ctx->r[3] = (le32_to_cpuvp(key +  9) >> 6) & 0x3f03fff;
+	ctx->r[4] = (le32_to_cpuvp(key + 12) >> 8) & 0x00fffff;
+
+	ctx->s[0] = le32_to_cpuvp(key + 16);
+	ctx->s[1] = le32_to_cpuvp(key + 20);
+	ctx->s[2] = le32_to_cpuvp(key + 24);
+	ctx->s[3] = le32_to_cpuvp(key + 28);
+
+   return 0;
+}
+
+static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
+   struct poly1305_ctx *ctx, const u8 *src,
+   unsigned int srclen, u32 hibit)
+{
+   u32 r0, r1, r2, r3, r4;
+   u32 s1, s2, s3, s4;
+   u32 h0, h1, h2, h3, h4;
+   u64 d0, d1, d2, d3, d4;
+
+	r0 = ctx->r[0];
+	r1 = ctx->r[1];
+	r2 = ctx->r[2];
+	r3 = ctx->r[3];
+	r4 = ctx->r[4];
+
+   s1 = r1 * 5;
+   s2 = r2 * 5;
+   s3 = r3 * 5;
+   s4 = r4 * 5

[PATCH 0/9] crypto: Add ChaCha20-Poly1305 AEAD support for IPsec

2015-06-01 Thread Martin Willi
This is a first version of a patch series implementing the ChaCha20-Poly1305
AEAD construction defined in RFC7539. It is based on the current cryptodev tree.

The first two patches implement the ChaCha20 cipher, the second two the Poly1305
authenticator, both in portable C for all architectures. Patch 5 and 6
provide an AEAD construction using the two cipher primitives, named rfc7539.

Patch 7 and 8 add a variant of the same AEAD that uses additional key material
as a nonce to shorten the explicit IV to 8 bytes, as defined for use in IPsec
in draft-ietf-ipsecme-chacha20-poly1305. The last patch exposes that AEAD
to IPsec users.
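
For clarity, the nonce construction used by that variant can be sketched
as follows; this assumes the layout from draft-ietf-ipsecme-chacha20-poly1305
(4 bytes of implicit salt from the extra key material, followed by the
8-byte explicit IV), and the function name is illustrative only:

#include <stdint.h>
#include <string.h>

/* Build the 12-byte ChaCha20-Poly1305 nonce from the implicit salt
 * (keying material) and the explicit 8-byte IV carried in ESP. */
static void rfc7539esp_nonce(uint8_t nonce[12],
			     const uint8_t salt[4], const uint8_t iv[8])
{
	memcpy(nonce, salt, 4);		/* implicit, from keying material */
	memcpy(nonce + 4, iv, 8);	/* explicit IV from the packet */
}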

I don't expect any technical changes to draft-ietf-ipsecme-chacha20-poly1305,
but we don't have an RFC name yet to reference the AEAD. We therefore simply
name it rfc7539esp, but other suggestions are welcome.

The AEAD uses the crypto_nivaead_type to make it available to IPsec. However,
I was unable to run test vectors against this type of AEAD on cryptodev, but
I've verified the vectors against the same AEAD using crypto_aead_type.
Additionally IPsec traffic has been tested against our userland ESP backend in
strongSwan.

On my x86_64 test setup the IPsec throughput is ~700Mbit/s with these portable
drivers. Architecture-specific drivers, subject to a future patchset, can improve
performance; with SSE, for example, doubling the throughput is feasible.

Martin Willi (9):
  crypto: Add a generic ChaCha20 stream cipher implementation
  crypto: testmgr - Add ChaCha20 test vectors from RFC7539
  crypto: Add a generic Poly1305 authenticator implementation
  crypto: testmgr - Add Poly1305 test vectors from RFC7539
  crypto: Add a ChaCha20-Poly1305 AEAD construction, RFC7539
  crypto: testmgr - Add ChaCha20-Poly1305 test vectors from RFC7539
  crypto: chacha20poly1305 - Add an IPsec variant for RFC7539 AEAD
  crypto: testmgr - Add draft-ietf-ipsecme-chacha20-poly1305 test vector
  xfrm: Define ChaCha20-Poly1305 AEAD XFRM algo for IPsec users

 crypto/Kconfig|  34 ++
 crypto/Makefile   |   3 +
 crypto/chacha20_generic.c | 216 +++
 crypto/chacha20poly1305.c | 687 +++
 crypto/poly1305_generic.c | 300 
 crypto/testmgr.c  |  54 +++
 crypto/testmgr.h  | 884 ++
 net/xfrm/xfrm_algo.c  |  12 +
 8 files changed, 2190 insertions(+)
 create mode 100644 crypto/chacha20_generic.c
 create mode 100644 crypto/chacha20poly1305.c
 create mode 100644 crypto/poly1305_generic.c

--
1.9.1


Re: CCM/GCM implementation defect

2015-04-23 Thread Martin Willi
Hi Herbert,

> > Does this mean that even the test vectors (crypto/testmgr.h) are broken?
>
> Indeed.  The test vectors appear to be generated either through
> our implementation or by one that is identical to us.

I'm not sure about that. RFC4106 refers to [1] for test vectors, which
is still available at web.archive.org [2].

Looking, for example, at Test Case 3: it is the same as in a newer
revision of the document [3], and it matches aes_gcm_enc_tv_template[2]
from testmgr.h exactly.

We, by the way, use test vectors from the same document in userland to
verify our own GCM backend, our OpenSSL backend and an AESNI/PCLMULQDQ
backend, and I've never heard of any incompatibilities.

Regards
Martin

[1]http://csrc.nist.gov/CryptoToolkit/modes/proposedmodes/gcm/gcm-spec.pdf
[2]http://web.archive.org/web/20070712195408/http://csrc.nist.gov/CryptoToolkit/modes/proposedmodes/gcm/gcm-spec.pdf
[3]http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf



Re: CCM/GCM implementation defect

2015-04-23 Thread Martin Willi
Hi Steffen,

> > It looks like our IPsec implementations of CCM and GCM are buggy
> > in that they don't include the IV in the authentication calculation.
>
> Seems like crypto_rfc4106_crypt() passes the associated data it
> got from ESP directly to gcm, without chaining with the IV.

Do you have any pointer for me where this is defined? Why is it needed,
given that GCM implicitly authenticates the IV by using it in Y0?
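
For reference, with the usual 96-bit IV the GCM tag depends on the IV as
(a sketch per the GCM spec, not a claim about the kernel code):

    $Y_0 = \mathrm{IV} \,\|\, 0^{31}1, \qquad T = E_K(Y_0) \oplus \mathrm{GHASH}_H(A, C)$

so any modification of the IV changes $E_K(Y_0)$ and therefore the tag $T$.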

Also, I've just verified that a tunnel between the Windows Filtering
Platform and Linux 3.13 using AES128GCM16 works just fine. So if we do
something wrong, the problem does not only affect Linux.

Regards
Martin



Re: [PATCH 2/3] xfrm: Traffic Flow Confidentiality for IPv4 ESP

2010-12-08 Thread Martin Willi

> In particular, why would we need a boundary at all? Setting it to
> anything other than the PMTU would seem to defeat the purpose of
> TFC for packets between the boundary and the PMTU.

I don't agree; this highly depends on the traffic on the SA. For a
general purpose tunnel with TCP flows, PMTU padding is fine. But if
there are only small packets (maybe SIP+RTP), padding to the PMTU is
very expensive.

The administrator setting up the SAs probably knows (or even controls
directly) what traffic it is used for, and might lower the boundary
accordingly.

Regards
Martin



[PATCH 3/3] xfrm: Traffic Flow Confidentiality for IPv6 ESP

2010-12-08 Thread Martin Willi
Add TFC padding to all packets smaller than the boundary configured
on the xfrm state. If the boundary is larger than the PMTU, limit
padding to the PMTU.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 net/ipv6/esp6.c |   32 
 1 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index ee9b93b..1b5c982 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -49,6 +49,8 @@ struct esp_skb_cb {
 
 #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
 
+static u32 esp6_get_mtu(struct xfrm_state *x, int mtu);
+
 /*
  * Allocate an AEAD request structure with extra space for SG and IV.
  *
@@ -140,6 +142,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
int blksize;
int clen;
int alen;
+   int plen;
+   int tfclen;
int nfrags;
u8 *iv;
u8 *tail;
@@ -148,18 +152,26 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	/* skb is pure payload to encrypt */
 	err = -ENOMEM;
 
-	/* Round to block size */
-	clen = skb->len;
-
 	aead = esp->aead;
 	alen = crypto_aead_authsize(aead);
 
+	tfclen = 0;
+	if (x->tfcpad) {
+		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+		u32 padto;
+
+		padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached));
+		if (skb->len < padto)
+			tfclen = padto - skb->len;
+	}
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	clen = ALIGN(clen + 2, blksize);
+	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	if (esp->padlen)
 		clen = ALIGN(clen, esp->padlen);
+	plen = clen - skb->len - tfclen;
 
-	if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0)
+	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+	if (err < 0)
 		goto error;
nfrags = err;
 
@@ -174,13 +186,17 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
 	do {
 		int i;
-		for (i=0; i<clen-skb->len - 2; i++)
+		for (i = 0; i < plen - 2; i++)
 			tail[i] = i + 1;
 	} while (0);
-	tail[clen-skb->len - 2] = (clen - skb->len) - 2;
-	tail[clen - skb->len - 1] = *skb_mac_header(skb);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = *skb_mac_header(skb);
 	pskb_put(skb, trailer, clen - skb->len + alen);
 
skb_push(skb, -skb_network_offset(skb));
-- 
1.7.1
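
As a standalone illustration of the length bookkeeping introduced in the
patch above (tfclen = TFC padding, plen = ESP padding including the two
trailer bytes, clen = padded payload length), with made-up example
values, not kernel code:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int skb_len = 40;	/* small payload, e.g. a keepalive */
	unsigned int padto   = 256;	/* x->tfcpad, already clamped to PMTU */
	unsigned int blksize = 16;	/* cipher block size, ALIGNed to 4 */
	unsigned int alen    = 12;	/* ICV length */
	unsigned int tfclen = 0, clen, plen;

	if (skb_len < padto)
		tfclen = padto - skb_len;
	/* +2 for the pad length and next header bytes of the ESP trailer */
	clen = ALIGN(skb_len + 2 + tfclen, blksize);
	plen = clen - skb_len - tfclen;

	printf("tfclen=%u plen=%u clen=%u cow=%u\n",
	       tfclen, plen, clen, tfclen + plen + alen);
	return 0;
}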



[PATCH 2/3] xfrm: Traffic Flow Confidentiality for IPv4 ESP

2010-12-08 Thread Martin Willi
Add TFC padding to all packets smaller than the boundary configured
on the xfrm state. If the boundary is larger than the PMTU, limit
padding to the PMTU.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 net/ipv4/esp4.c |   32 
 1 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 14ca1f1..e42a905 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -23,6 +23,8 @@ struct esp_skb_cb {
 
 #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
 
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
+
 /*
  * Allocate an AEAD request structure with extra space for SG and IV.
  *
@@ -117,25 +119,35 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int blksize;
int clen;
int alen;
+   int plen;
+   int tfclen;
int nfrags;
 
 	/* skb is pure payload to encrypt */
 
 	err = -ENOMEM;
 
-	/* Round to block size */
-	clen = skb->len;
-
 	esp = x->data;
 	aead = esp->aead;
 	alen = crypto_aead_authsize(aead);
 
+	tfclen = 0;
+	if (x->tfcpad) {
+		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+		u32 padto;
+
+		padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+		if (skb->len < padto)
+			tfclen = padto - skb->len;
+	}
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	clen = ALIGN(clen + 2, blksize);
+	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	if (esp->padlen)
 		clen = ALIGN(clen, esp->padlen);
+	plen = clen - skb->len - tfclen;
 
-	if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0)
+	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+	if (err < 0)
 		goto error;
nfrags = err;
 
@@ -150,13 +162,17 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
 	do {
 		int i;
-		for (i=0; i<clen-skb->len - 2; i++)
+		for (i = 0; i < plen - 2; i++)
 			tail[i] = i + 1;
 	} while (0);
-	tail[clen - skb->len - 2] = (clen - skb->len) - 2;
-	tail[clen - skb->len - 1] = *skb_mac_header(skb);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = *skb_mac_header(skb);
 	pskb_put(skb, trailer, clen - skb->len + alen);
 
skb_push(skb, -skb_network_offset(skb));
-- 
1.7.1



[PATCH 0/3] xfrm: ESP Traffic Flow Confidentiality padding (v3)

2010-12-08 Thread Martin Willi
The following patchset adds Traffic Flow Confidentiality padding. The
first patch introduces a new Netlink XFRM attribute to configure TFC via
userspace. Patch two and three implement the padding logic in IPv4 and
IPv6 ESP. Padding is always done using the RFC4303 format and is clamped
to the PMTU.

Changes from v2:
  - Remove unused flag field in attribute, use a plain u32 as attribute payload
  - Reject installation of TFC padding on non-tunnel SAs

Martin Willi (3):
  xfrm: Add Traffic Flow Confidentiality padding XFRM attribute
  xfrm: Traffic Flow Confidentiality for IPv4 ESP
  xfrm: Traffic Flow Confidentiality for IPv6 ESP

 include/linux/xfrm.h |1 +
 include/net/xfrm.h   |1 +
 net/ipv4/esp4.c  |   32 
 net/ipv6/esp6.c  |   32 
 net/xfrm/xfrm_user.c |   19 +--
 5 files changed, 67 insertions(+), 18 deletions(-)



[PATCH 1/3] xfrm: Add Traffic Flow Confidentiality padding XFRM attribute

2010-12-08 Thread Martin Willi
The XFRMA_TFCPAD attribute for XFRM state installation configures
Traffic Flow Confidentiality by padding ESP packets to a specified
length.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 include/linux/xfrm.h |1 +
 include/net/xfrm.h   |1 +
 net/xfrm/xfrm_user.c |   19 +--
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index b971e38..930fdd2 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -283,6 +283,7 @@ enum xfrm_attr_type_t {
XFRMA_KMADDRESS,/* struct xfrm_user_kmaddress */
XFRMA_ALG_AUTH_TRUNC,   /* struct xfrm_algo_auth */
XFRMA_MARK, /* struct xfrm_mark */
+   XFRMA_TFCPAD,   /* __u32 */
__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index bcfb6b2..bdcade7 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -143,6 +143,7 @@ struct xfrm_state {
 	struct xfrm_id		id;
 	struct xfrm_selector	sel;
 	struct xfrm_mark	mark;
+	u32			tfcpad;
 
u32 genid;
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 8bae6b2..8eb8895 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -148,7 +148,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 !attrs[XFRMA_ALG_AUTH_TRUNC]) ||
attrs[XFRMA_ALG_AEAD]   ||
attrs[XFRMA_ALG_CRYPT]  ||
-   attrs[XFRMA_ALG_COMP])
+   attrs[XFRMA_ALG_COMP]   ||
+   attrs[XFRMA_TFCPAD])
goto out;
break;
 
@@ -165,6 +166,9 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		     attrs[XFRMA_ALG_CRYPT]) &&
 		    attrs[XFRMA_ALG_AEAD])
 			goto out;
+		if (attrs[XFRMA_TFCPAD] &&
+		    p->mode != XFRM_MODE_TUNNEL)
+			goto out;
break;
 
case IPPROTO_COMP:
@@ -172,7 +176,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
attrs[XFRMA_ALG_AEAD]   ||
attrs[XFRMA_ALG_AUTH]   ||
attrs[XFRMA_ALG_AUTH_TRUNC] ||
-   attrs[XFRMA_ALG_CRYPT])
+   attrs[XFRMA_ALG_CRYPT]  ||
+   attrs[XFRMA_TFCPAD])
goto out;
break;
 
@@ -186,6 +191,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
attrs[XFRMA_ALG_CRYPT]  ||
attrs[XFRMA_ENCAP]  ||
attrs[XFRMA_SEC_CTX]||
+   attrs[XFRMA_TFCPAD] ||
!attrs[XFRMA_COADDR])
goto out;
break;
@@ -439,6 +445,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 		goto error;
 	}
 
+	if (attrs[XFRMA_TFCPAD])
+		x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]);
+
 	if (attrs[XFRMA_COADDR]) {
 		x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]),
 				    sizeof(*x->coaddr), GFP_KERNEL);
@@ -688,6 +697,9 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 	if (x->encap)
 		NLA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
 
+	if (x->tfcpad)
+		NLA_PUT_U32(skb, XFRMA_TFCPAD, x->tfcpad);
+
if (xfrm_mark_put(skb, x-mark))
goto nla_put_failure;
 
@@ -2122,6 +2134,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) },
[XFRMA_KMADDRESS]   = { .len = sizeof(struct xfrm_user_kmaddress) },
[XFRMA_MARK]= { .len = sizeof(struct xfrm_mark) },
+   [XFRMA_TFCPAD]  = { .type = NLA_U32 },
 };
 
 static struct xfrm_link {
@@ -2301,6 +2314,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
 		l += nla_total_size(sizeof(*x->calg));
 	if (x->encap)
 		l += nla_total_size(sizeof(*x->encap));
+	if (x->tfcpad)
+		l += nla_total_size(sizeof(x->tfcpad));
 	if (x->security)
 		l += nla_total_size(sizeof(struct xfrm_user_sec_ctx) +
 				    x->security->ctx_len);
-- 
1.7.1



[PATCH 2/3] xfrm: Traffic Flow Confidentiality for IPv4 ESP

2010-12-07 Thread Martin Willi
Add TFC padding to all packets smaller than the boundary configured
on the xfrm state. If the boundary is larger than the PMTU, limit
padding to the PMTU.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 net/ipv4/esp4.c |   33 +
 1 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 14ca1f1..e7784e8 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -23,6 +23,8 @@ struct esp_skb_cb {
 
 #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
 
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
+
 /*
  * Allocate an AEAD request structure with extra space for SG and IV.
  *
@@ -117,25 +119,36 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int blksize;
int clen;
int alen;
+   int plen;
+   int tfclen;
int nfrags;
 
 	/* skb is pure payload to encrypt */
 
 	err = -ENOMEM;
 
-	/* Round to block size */
-	clen = skb->len;
-
 	esp = x->data;
 	aead = esp->aead;
 	alen = crypto_aead_authsize(aead);
 
+	tfclen = 0;
+	if (x->tfc.pad && x->props.mode == XFRM_MODE_TUNNEL) {
+		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+		u32 mtu, padto;
+
+		mtu = esp4_get_mtu(x, dst->child_mtu_cached);
+		padto = min_t(u32, x->tfc.pad, mtu);
+		if (skb->len < padto)
+			tfclen = padto - skb->len;
+	}
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	clen = ALIGN(clen + 2, blksize);
+	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	if (esp->padlen)
 		clen = ALIGN(clen, esp->padlen);
+	plen = clen - skb->len - tfclen;
 
-	if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0)
+	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+	if (err < 0)
 		goto error;
nfrags = err;
 
@@ -150,13 +163,17 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
 	do {
 		int i;
-		for (i=0; i<clen-skb->len - 2; i++)
+		for (i = 0; i < plen - 2; i++)
 			tail[i] = i + 1;
 	} while (0);
-	tail[clen - skb->len - 2] = (clen - skb->len) - 2;
-	tail[clen - skb->len - 1] = *skb_mac_header(skb);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = *skb_mac_header(skb);
 	pskb_put(skb, trailer, clen - skb->len + alen);
 
skb_push(skb, -skb_network_offset(skb));
-- 
1.7.1



[PATCH 3/3] xfrm: Traffic Flow Confidentiality for IPv6 ESP

2010-12-07 Thread Martin Willi
Add TFC padding to all packets smaller than the boundary configured
on the xfrm state. If the boundary is larger than the PMTU, limit
padding to the PMTU.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 net/ipv6/esp6.c |   33 +
 1 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index ee9b93b..8b493b0 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -49,6 +49,8 @@ struct esp_skb_cb {
 
 #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
 
+static u32 esp6_get_mtu(struct xfrm_state *x, int mtu);
+
 /*
  * Allocate an AEAD request structure with extra space for SG and IV.
  *
@@ -140,6 +142,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
int blksize;
int clen;
int alen;
+   int plen;
+   int tfclen;
int nfrags;
u8 *iv;
u8 *tail;
@@ -148,18 +152,27 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	/* skb is pure payload to encrypt */
 	err = -ENOMEM;
 
-	/* Round to block size */
-	clen = skb->len;
-
 	aead = esp->aead;
 	alen = crypto_aead_authsize(aead);
 
+	tfclen = 0;
+	if (x->tfc.pad && x->props.mode == XFRM_MODE_TUNNEL) {
+		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+		u32 mtu, padto;
+
+		mtu = esp6_get_mtu(x, dst->child_mtu_cached);
+		padto = min_t(u32, x->tfc.pad, mtu);
+		if (skb->len < padto)
+			tfclen = padto - skb->len;
+	}
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	clen = ALIGN(clen + 2, blksize);
+	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	if (esp->padlen)
 		clen = ALIGN(clen, esp->padlen);
+	plen = clen - skb->len - tfclen;
 
-	if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0)
+	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+	if (err < 0)
 		goto error;
nfrags = err;
 
@@ -174,13 +187,17 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
 	do {
 		int i;
-		for (i=0; i<clen-skb->len - 2; i++)
+		for (i = 0; i < plen - 2; i++)
 			tail[i] = i + 1;
 	} while (0);
-	tail[clen-skb->len - 2] = (clen - skb->len) - 2;
-	tail[clen - skb->len - 1] = *skb_mac_header(skb);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = *skb_mac_header(skb);
 	pskb_put(skb, trailer, clen - skb->len + alen);
 
skb_push(skb, -skb_network_offset(skb));
-- 
1.7.1



[PATCH 0/3] xfrm: ESP Traffic Flow Confidentiality padding (v2)

2010-12-07 Thread Martin Willi
The following patchset adds Traffic Flow Confidentiality padding. The
first patch introduces a new Netlink XFRM attribute to configure TFC via
userspace. Patch two and three implement the padding logic in IPv4 and
IPv6 ESP. Padding is always done using the RFC4303 format and is clamped
to the PMTU.

Changes from v1:
  - Always clamp padding length to never exceed PMTU
  - Remove XFRM_TFC_PMTU flag, use USHRT_MAX padding length instead
  - Remove ESPv2 padding fallback due to the concerns from Herbert
  - Keep the existing ESP padlen field, as we don't mangle ESP padding

I've kept the currently unused flags in the XFRM attribute to implement
ESPv2 fallback or other extensions in the future without changing the ABI.

Martin Willi (3):
  xfrm: Add Traffic Flow Confidentiality padding XFRM attribute
  xfrm: Traffic Flow Confidentiality for IPv4 ESP
  xfrm: Traffic Flow Confidentiality for IPv6 ESP

 include/linux/xfrm.h |6 ++
 include/net/xfrm.h   |1 +
 net/ipv4/esp4.c  |   33 +
 net/ipv6/esp6.c  |   33 +
 net/xfrm/xfrm_user.c |   16 ++--
 5 files changed, 71 insertions(+), 18 deletions(-)



[PATCH 1/3] xfrm: Add Traffic Flow Confidentiality padding XFRM attribute

2010-12-07 Thread Martin Willi
The XFRMA_TFC attribute for XFRM state installation configures
Traffic Flow Confidentiality by padding ESP packets to a specified
length.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 include/linux/xfrm.h |6 ++
 include/net/xfrm.h   |1 +
 net/xfrm/xfrm_user.c |   16 ++--
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index b971e38..7cd5232 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -283,6 +283,7 @@ enum xfrm_attr_type_t {
XFRMA_KMADDRESS,/* struct xfrm_user_kmaddress */
XFRMA_ALG_AUTH_TRUNC,   /* struct xfrm_algo_auth */
XFRMA_MARK, /* struct xfrm_mark */
+   XFRMA_TFC,  /* struct xfrm_tfc */
__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
@@ -293,6 +294,11 @@ struct xfrm_mark {
__u32   m; /* mask */
 };
 
+struct xfrm_tfc {
+   __u16   pad;
+   __u16   flags;
+};
+
 enum xfrm_sadattr_type_t {
XFRMA_SAD_UNSPEC,
XFRMA_SAD_CNT,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index bcfb6b2..03468c0 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -143,6 +143,7 @@ struct xfrm_state {
 	struct xfrm_id		id;
 	struct xfrm_selector	sel;
 	struct xfrm_mark	mark;
+	struct xfrm_tfc		tfc;
 
u32 genid;
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 8bae6b2..df6a60f 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -148,7 +148,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 !attrs[XFRMA_ALG_AUTH_TRUNC]) ||
attrs[XFRMA_ALG_AEAD]   ||
attrs[XFRMA_ALG_CRYPT]  ||
-   attrs[XFRMA_ALG_COMP])
+   attrs[XFRMA_ALG_COMP]   ||
+   attrs[XFRMA_TFC])
goto out;
break;
 
@@ -172,7 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
attrs[XFRMA_ALG_AEAD]   ||
attrs[XFRMA_ALG_AUTH]   ||
attrs[XFRMA_ALG_AUTH_TRUNC] ||
-   attrs[XFRMA_ALG_CRYPT])
+   attrs[XFRMA_ALG_CRYPT]  ||
+   attrs[XFRMA_TFC])
goto out;
break;
 
@@ -186,6 +188,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
attrs[XFRMA_ALG_CRYPT]  ||
attrs[XFRMA_ENCAP]  ||
attrs[XFRMA_SEC_CTX]||
+   attrs[XFRMA_TFC]||
!attrs[XFRMA_COADDR])
goto out;
break;
@@ -439,6 +442,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 		goto error;
 	}
 
+	if (attrs[XFRMA_TFC])
+		memcpy(&x->tfc, nla_data(attrs[XFRMA_TFC]), sizeof(x->tfc));
+
 	if (attrs[XFRMA_COADDR]) {
 		x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]),
 				    sizeof(*x->coaddr), GFP_KERNEL);
@@ -688,6 +694,9 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 	if (x->encap)
 		NLA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
 
+	if (x->tfc.pad)
+		NLA_PUT(skb, XFRMA_TFC, sizeof(x->tfc), &x->tfc);
+
if (xfrm_mark_put(skb, x-mark))
goto nla_put_failure;
 
@@ -2122,6 +2131,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) },
[XFRMA_KMADDRESS]   = { .len = sizeof(struct xfrm_user_kmaddress) },
[XFRMA_MARK]= { .len = sizeof(struct xfrm_mark) },
+   [XFRMA_TFC] = { .len = sizeof(struct xfrm_tfc) },
 };
 
 static struct xfrm_link {
@@ -2301,6 +2311,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
 		l += nla_total_size(sizeof(*x->calg));
 	if (x->encap)
 		l += nla_total_size(sizeof(*x->encap));
+	if (x->tfc.pad)
+		l += nla_total_size(sizeof(x->tfc));
 	if (x->security)
 		l += nla_total_size(sizeof(struct xfrm_user_sec_ctx) +
 				    x->security->ctx_len);
-- 
1.7.1



Re: [PATCH 3/5] xfrm: Traffic Flow Confidentiality for IPv4 ESP

2010-12-06 Thread Martin Willi
Hi Herbert,

> I know why you want to do this, what I'm asking is do you have any
> research behind this with regards to security
>
> Has this scheme been discussed on a public forum somewhere?

No, sorry, I haven't found much valuable discussion about TFC padding.
Nothing at all on how to overcome the ESPv2 padding limit.

> using an insecure RNG to generate a value that is then used as the
> basis for concealment

Using get_random_bytes() adds another ~10% processing overhead due to
the underlying sha_transform. But this is probably negligible, as we add
much more overhead with the additional padding to encrypt/MAC.

I'll re-spin the patchset with get_random_bytes(). Even if the ESPv2
padding fallback makes TFC in this case less efficient, it shouldn't
harm. Or do you see this differently?

Regards
Martin



Re: [PATCH 3/5] xfrm: Traffic Flow Confidentiality for IPv4 ESP

2010-12-03 Thread Martin Willi

> What is the basis of this random length padding?

Let's assume a peer does not support ESPv3 padding, but we have to pad a
small packet with more than 255 bytes. We can't: the ESP padding length
field is limited to 255.
We could add 255 fixed bytes, but an eavesdropper could just subtract
the 255 bytes from all packets smaller than the boundary, rendering our
TFC efforts useless.
By inserting random length padding in the range possible, the
eavesdropper knows that the packet has a length between length and
length - 255, but can't estimate its exact size. I'm aware that this
is not optimal, but probably the best we can do(?).
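
A standalone sketch of that ESPv2 fallback (mirroring the logic in the
esp_output() hunk of patch 3/5 below; illustrative only, with rand()
standing in for the kernel's random32()):

#include <stdio.h>
#include <stdlib.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

/* Compute the padded length: try to reach the boundary, fall back to
 * random padding when the 255-byte ESP pad length field would overflow. */
static unsigned int espv2_clen(unsigned int skb_len, unsigned int padto,
			       unsigned int blksize)
{
	unsigned int clen = ALIGN(padto + 2, blksize);

	if (clen - skb_len - 2 > 255) {
		clen = ALIGN(skb_len + (rand() & 0xff) + 2, blksize);
		if (clen - skb_len - 2 > 255)	/* rounding up overshot */
			clen -= blksize;
	}
	return clen;
}

int main(void)
{
	printf("clen=%u\n", espv2_clen(40, 1400, 16));
	return 0;
}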

> Also, what happens when padto exceeds the MTU? Doesn't this
> effectively disable PMTU-discovery?

Yes. An administrator setting a padto value larger than PMTU can
currently break PMTU discovery.

> I know that your last patch allows the padto to be set by PMTU.
> But why would we ever want to use a padto that isn't clamped by
> PMTU?

Probably never, valid point.

I'll add PMTU clamping to the next revision. We probably can drop the
PMTU flag then and just use USHRT_MAX instead. 

Thanks!
Martin



[PATCH 3/5] xfrm: Traffic Flow Confidentiality for IPv4 ESP

2010-11-30 Thread Martin Willi
If configured on xfrm state, increase the length of all packets to
a given boundary using TFC padding as specified in RFC4303. For
transport mode, or if the XFRM_TFC_ESPV3 flag is not set, grow the ESP
padding field instead.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 net/ipv4/esp4.c |   42 +-
 1 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 67e4c12..a6adfbc 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -117,23 +117,43 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int blksize;
int clen;
int alen;
+   int plen;
+   int tfclen;
+   int tfcpadto;
int nfrags;
 
 	/* skb is pure payload to encrypt */
 
 	err = -ENOMEM;
 
-	/* Round to block size */
-	clen = skb->len;
-
 	esp = x->data;
 	aead = esp->aead;
 	alen = crypto_aead_authsize(aead);
 
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	clen = ALIGN(clen + 2, blksize);
-
-	if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0)
+	tfclen = 0;
+	tfcpadto = x->tfc.pad;
+
+	if (skb->len >= tfcpadto) {
+		clen = ALIGN(skb->len + 2, blksize);
+	} else if (x->tfc.flags & XFRM_TFC_ESPV3 &&
+		   x->props.mode == XFRM_MODE_TUNNEL) {
+		/* ESPv3 TFC padding, append bytes to payload */
+		tfclen = tfcpadto - skb->len;
+		clen = ALIGN(skb->len + 2 + tfclen, blksize);
+	} else {
+		/* ESPv2 TFC padding. If we exceed the 255 byte maximum, use
+		 * random padding to hide payload length as good as possible. */
+		clen = ALIGN(skb->len + 2 + tfcpadto - skb->len, blksize);
+		if (clen - skb->len - 2 > 255) {
+			clen = ALIGN(skb->len + (u8)random32() + 2, blksize);
+			if (clen - skb->len - 2 > 255)
+				clen -= blksize;
+		}
+	}
+	plen = clen - skb->len - tfclen;
+	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+	if (err < 0)
 		goto error;
nfrags = err;
 
@@ -148,13 +168,17 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
 	do {
 		int i;
-		for (i=0; i<clen-skb->len - 2; i++)
+		for (i = 0; i < plen - 2; i++)
 			tail[i] = i + 1;
 	} while (0);
-	tail[clen - skb->len - 2] = (clen - skb->len) - 2;
-	tail[clen - skb->len - 1] = *skb_mac_header(skb);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = *skb_mac_header(skb);
 	pskb_put(skb, trailer, clen - skb->len + alen);
 
skb_push(skb, -skb_network_offset(skb));
-- 
1.7.1



[PATCH 2/5] xfrm: Remove unused ESP padlen field

2010-11-30 Thread Martin Willi
The padlen field in IPv4/6 ESP is used to align the ESP padding length
to a value larger than the aead block size. There is, however, no
option to set this field, hence it is removed.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 include/net/esp.h |3 ---
 net/ipv4/esp4.c   |   11 ++-
 net/ipv6/esp6.c   |   11 ++-
 3 files changed, 4 insertions(+), 21 deletions(-)

diff --git a/include/net/esp.h b/include/net/esp.h
index d584513..6dfb4d0 100644
--- a/include/net/esp.h
+++ b/include/net/esp.h
@@ -6,9 +6,6 @@
 struct crypto_aead;
 
 struct esp_data {
-   /* 0..255 */
-   int padlen;
-
 	/* Confidentiality & Integrity */
struct crypto_aead *aead;
 };
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 14ca1f1..67e4c12 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -132,8 +130,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
clen = ALIGN(clen + 2, blksize);
-	if (esp->padlen)
-		clen = ALIGN(clen, esp->padlen);
 
 	if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0)
goto error;
@@ -386,12 +384,11 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 {
 	struct esp_data *esp = x->data;
 	u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
-	u32 align = max_t(u32, blksize, esp->padlen);
 	u32 rem;
 
 	mtu -= x->props.header_len + crypto_aead_authsize(esp->aead);
-	rem = mtu & (align - 1);
-	mtu &= ~(align - 1);
+	rem = mtu & (blksize - 1);
+	mtu &= ~(blksize - 1);
 
 	switch (x->props.mode) {
 	case XFRM_MODE_TUNNEL:
@@ -570,8 +567,6 @@ static int esp_init_state(struct xfrm_state *x)
 
 	aead = esp->aead;
 
-	esp->padlen = 0;
-
 	x->props.header_len = sizeof(struct ip_esp_hdr) +
 			      crypto_aead_ivsize(aead);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
@@ -594,8 +589,6 @@ static int esp_init_state(struct xfrm_state *x)
}
 
align = ALIGN(crypto_aead_blocksize(aead), 4);
-	if (esp->padlen)
-		align = max_t(u32, align, esp->padlen);
 	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
 
 error:
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index ee9b93b..e9e6e1c 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -156,8 +154,6 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
clen = ALIGN(clen + 2, blksize);
-	if (esp->padlen)
-		clen = ALIGN(clen, esp->padlen);
 
 	if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0)
goto error;
@@ -337,12 +335,11 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
 {
 	struct esp_data *esp = x->data;
 	u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
-	u32 align = max_t(u32, blksize, esp->padlen);
 	u32 rem;
 
 	mtu -= x->props.header_len + crypto_aead_authsize(esp->aead);
-	rem = mtu & (align - 1);
-	mtu &= ~(align - 1);
+	rem = mtu & (blksize - 1);
+	mtu &= ~(blksize - 1);
 
 	if (x->props.mode != XFRM_MODE_TUNNEL) {
 		u32 padsize = ((blksize - 1) & 7) + 1;
@@ -516,8 +513,6 @@ static int esp6_init_state(struct xfrm_state *x)
 
 	aead = esp->aead;
 
-	esp->padlen = 0;
-
 	x->props.header_len = sizeof(struct ip_esp_hdr) +
 			      crypto_aead_ivsize(aead);
 	switch (x->props.mode) {
@@ -536,8 +531,6 @@ static int esp6_init_state(struct xfrm_state *x)
}
 
align = ALIGN(crypto_aead_blocksize(aead), 4);
-	if (esp->padlen)
-		align = max_t(u32, align, esp->padlen);
 	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
 
 error:
-- 
1.7.1
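
A standalone illustration of the mask arithmetic in esp{4,6}_get_mtu()
after this change (alignment is now to the block size only), with
made-up numbers, not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned int mtu = 1500 - 20 - 12;	/* minus header_len and ICV, say */
	unsigned int blksize = 16;
	unsigned int rem;

	rem = mtu & (blksize - 1);	/* bytes past the last full block */
	mtu &= ~(blksize - 1);		/* round down to a block multiple */

	printf("mtu=%u rem=%u\n", mtu, rem);
	return 0;
}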



[PATCH 4/5] xfrm: Traffic Flow Confidentiality for IPv6 ESP

2010-11-30 Thread Martin Willi
If configured on xfrm state, increase the length of all packets to
a given boundary using TFC padding as specified in RFC4303. For
transport mode, or if the XFRM_TFC_ESPV3 flag is not set, grow the ESP
padding field instead.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 net/ipv6/esp6.c |   42 +-
 1 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index e9e6e1c..9494cb1 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -140,6 +140,9 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
int blksize;
int clen;
int alen;
+   int plen;
+   int tfclen;
+   int tfcpadto;
int nfrags;
u8 *iv;
u8 *tail;
@@ -148,16 +151,33 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	/* skb is pure payload to encrypt */
 	err = -ENOMEM;
 
-	/* Round to block size */
-	clen = skb->len;
-
 	aead = esp->aead;
 	alen = crypto_aead_authsize(aead);
 
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	clen = ALIGN(clen + 2, blksize);
-
-	if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0)
+	tfclen = 0;
+	tfcpadto = x->tfc.pad;
+
+	if (skb->len >= tfcpadto) {
+		clen = ALIGN(skb->len + 2, blksize);
+	} else if (x->tfc.flags & XFRM_TFC_ESPV3 &&
+		   x->props.mode == XFRM_MODE_TUNNEL) {
+		/* ESPv3 TFC padding, append bytes to payload */
+		tfclen = tfcpadto - skb->len;
+		clen = ALIGN(skb->len + 2 + tfclen, blksize);
+	} else {
+		/* ESPv2 TFC padding. If we exceed the 255 byte maximum, use
+		 * random padding to hide payload length as good as possible. */
+		clen = ALIGN(skb->len + 2 + tfcpadto - skb->len, blksize);
+		if (clen - skb->len - 2 > 255) {
+			clen = ALIGN(skb->len + (u8)random32() + 2, blksize);
+			if (clen - skb->len - 2 > 255)
+				clen -= blksize;
+		}
+	}
+	plen = clen - skb->len - tfclen;
+	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+	if (err < 0)
 		goto error;
nfrags = err;
 
@@ -172,13 +192,17 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
 	do {
 		int i;
-		for (i=0; i<clen-skb->len - 2; i++)
+		for (i = 0; i < plen - 2; i++)
 			tail[i] = i + 1;
 	} while (0);
-	tail[clen-skb->len - 2] = (clen - skb->len) - 2;
-	tail[clen - skb->len - 1] = *skb_mac_header(skb);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = *skb_mac_header(skb);
 	pskb_put(skb, trailer, clen - skb->len + alen);
 
skb_push(skb, -skb_network_offset(skb));
-- 
1.7.1



[PATCH 0/5] xfrm: ESP Traffic Flow Confidentiality padding

2010-11-30 Thread Martin Willi
The following patchset adds Traffic Flow Confidentiality padding. The
first patch introduces a new Netlink XFRM attribute to configure TFC via
userspace. The second patch removes an existing padlen option in ESP; It
is not used at all, and I currently don't see the purpose of the field,
nor how it should interact with TFC padding enabled. Patch three and four
implement the padding logic in IPv4 and IPv6 ESP.

Padding is specified with a length to pad the encapsulated data to.
Support for TFC padding as specified in RFC4303 must be negotiated
explicitly by the key management protocol, hence the optional flag. The
fallback with ESP padding field expansion is limited to 255 padding
bytes. If this is insufficient, the padding length is randomized to hide
the real length as well as possible.

The last patch adds an option to pad all packets to the PMTU. It works
fine for simple scenarios, but I'm not sure if my PMTU lookup works in
all cases (nested transforms?). Any pointer would be appreciated.

Martin Willi (5):
  xfrm: Add Traffic Flow Confidentiality padding XFRM attribute
  xfrm: Remove unused ESP padlen field
  xfrm: Traffic Flow Confidentiality for IPv4 ESP
  xfrm: Traffic Flow Confidentiality for IPv6 ESP
  xfrm: Add TFC padding option to automatically pad to PMTU

 include/linux/xfrm.h |8 +++
 include/net/esp.h|3 --
 include/net/xfrm.h   |1 +
 net/ipv4/esp4.c  |   58 +++--
 net/ipv6/esp6.c  |   58 +++--
 net/xfrm/xfrm_user.c |   16 -
 6 files changed, 105 insertions(+), 39 deletions(-)



[PATCH 1/5] xfrm: Add Traffic Flow Confidentiality padding XFRM attribute

2010-11-30 Thread Martin Willi
The XFRMA_TFCPAD attribute for XFRM state installation configures
Traffic Flow Confidentiality by padding ESP packets to a specified
length. To use RFC4303 TFC padding and overcome the 255 byte ESP
padding field limit, the XFRM_TFC_ESPV3 flag must be set.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 include/linux/xfrm.h |7 +++
 include/net/xfrm.h   |1 +
 net/xfrm/xfrm_user.c |   16 ++--
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index b971e38..b1e5f8a 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -283,6 +283,7 @@ enum xfrm_attr_type_t {
XFRMA_KMADDRESS,/* struct xfrm_user_kmaddress */
XFRMA_ALG_AUTH_TRUNC,   /* struct xfrm_algo_auth */
XFRMA_MARK, /* struct xfrm_mark */
+   XFRMA_TFC,  /* struct xfrm_tfc */
__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
@@ -293,6 +294,12 @@ struct xfrm_mark {
__u32   m; /* mask */
 };
 
+struct xfrm_tfc {
+   __u16   pad;
+   __u16   flags;
+#define XFRM_TFC_ESPV3 1   /* RFC4303 TFC padding, if possible */
+};
+
 enum xfrm_sadattr_type_t {
XFRMA_SAD_UNSPEC,
XFRMA_SAD_CNT,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index bcfb6b2..03468c0 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -143,6 +143,7 @@ struct xfrm_state {
 	struct xfrm_id		id;
 	struct xfrm_selector	sel;
 	struct xfrm_mark	mark;
+	struct xfrm_tfc		tfc;
 
u32 genid;
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 8bae6b2..0b4ec02 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -148,7 +148,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 !attrs[XFRMA_ALG_AUTH_TRUNC]) ||
attrs[XFRMA_ALG_AEAD]   ||
attrs[XFRMA_ALG_CRYPT]  ||
-   attrs[XFRMA_ALG_COMP])
+   attrs[XFRMA_ALG_COMP]   ||
+   attrs[XFRMA_TFC])
goto out;
break;
 
@@ -172,7 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
attrs[XFRMA_ALG_AEAD]   ||
attrs[XFRMA_ALG_AUTH]   ||
attrs[XFRMA_ALG_AUTH_TRUNC] ||
-   attrs[XFRMA_ALG_CRYPT])
+   attrs[XFRMA_ALG_CRYPT]  ||
+   attrs[XFRMA_TFC])
goto out;
break;
 
@@ -186,6 +188,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
attrs[XFRMA_ALG_CRYPT]  ||
attrs[XFRMA_ENCAP]  ||
attrs[XFRMA_SEC_CTX]||
+   attrs[XFRMA_TFC]||
!attrs[XFRMA_COADDR])
goto out;
break;
@@ -439,6 +442,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 		goto error;
 	}
 
+	if (attrs[XFRMA_TFC])
+		memcpy(&x->tfc, nla_data(attrs[XFRMA_TFC]), sizeof(x->tfc));
+
 	if (attrs[XFRMA_COADDR]) {
 		x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]),
 				    sizeof(*x->coaddr), GFP_KERNEL);
@@ -688,6 +694,9 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 	if (x->encap)
 		NLA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
 
+	if (x->tfc.pad || x->tfc.flags)
+		NLA_PUT(skb, XFRMA_TFC, sizeof(x->tfc), &x->tfc);
+
if (xfrm_mark_put(skb, x-mark))
goto nla_put_failure;
 
@@ -2122,6 +2131,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) },
[XFRMA_KMADDRESS]   = { .len = sizeof(struct xfrm_user_kmaddress) },
[XFRMA_MARK]= { .len = sizeof(struct xfrm_mark) },
+   [XFRMA_TFC] = { .len = sizeof(struct xfrm_tfc) },
 };
 
 static struct xfrm_link {
@@ -2301,6 +2311,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
 		l += nla_total_size(sizeof(*x->calg));
 	if (x->encap)
 		l += nla_total_size(sizeof(*x->encap));
+	if (x->tfc.pad)
+		l += nla_total_size(sizeof(x->tfc));
 	if (x->security)
 		l += nla_total_size(sizeof(struct xfrm_user_sec_ctx) +
 				    x->security->ctx_len);
-- 
1.7.1



Re: [PATCH 4/4] crypto: algif_skcipher - User-space interface for skcipher operations

2010-11-15 Thread Martin Willi

> This patch adds the af_alg plugin for symmetric key ciphers,
> corresponding to the ablkcipher kernel operation type.

I can confirm that the newest patch fixes the page leak.

Tested-by: Martin Willi mar...@strongswan.org



Re: [PATCH 2/4] crypto: af_alg - User-space interface for Crypto API

2010-11-15 Thread Martin Willi

> This patch creates the backbone of the user-space interface for
> the Crypto API, through a new socket family AF_ALG.

Tested-by: Martin Willi mar...@strongswan.org




Re: [PATCH 4/4] crypto: algif_skcipher - User-space interface for skcipher operations

2010-11-08 Thread Martin Willi

> Hmm, can you show me your test program and how you determined
> that it was leaking pages?

The test program below runs 1000 encryptions:

# grep nr_free /proc/vmstat 
nr_free_pages 11031
# ./test
...
# grep nr_free /proc/vmstat 
nr_free_pages 10026
# ./test
...
# grep nr_free /proc/vmstat 
nr_free_pages 9027
# ./test
...
# grep nr_free /proc/vmstat 
nr_free_pages 8025

Regards
Martin

--
#include <stdio.h>
#include <unistd.h>
#include <stddef.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

int main()
{
int tfm, i;
char key[16];

struct sockaddr_alg sa = {
	.salg_family = AF_ALG,
	.salg_type = "skcipher",
	.salg_name = "cbc(aes)",
};

tfm = socket(AF_ALG, SOCK_SEQPACKET, 0);
if (tfm == -1 ||
    bind(tfm, (struct sockaddr*)&sa, sizeof(sa)) == -1)
{
return 1;
}
memset(key, 0x34, sizeof(key));
if (setsockopt(tfm, SOL_ALG, ALG_SET_KEY,
key, sizeof(key)) == -1)
{
return 1;
}

for (i = 0; i < 1000; i++)
{
struct msghdr msg = {};
struct cmsghdr *cmsg;
struct af_alg_iv *ivm;
u_int32_t type;
struct iovec iov;
char buf[CMSG_SPACE(sizeof(type)) +
 CMSG_SPACE(offsetof(struct af_alg_iv, iv)+16)];
char data[64];
ssize_t len;
int op;

op = accept(tfm, NULL, 0);
if (op == -1)
{
return 1;
}

type = ALG_OP_ENCRYPT;
memset(data, 0x12, sizeof(data));
memset(buf, 0, sizeof(buf));

msg.msg_control = buf;
msg.msg_controllen = sizeof(buf);

cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_ALG;
cmsg->cmsg_type = ALG_SET_OP;
cmsg->cmsg_len = CMSG_LEN(sizeof(type));
*(u_int32_t*)CMSG_DATA(cmsg) = type;

cmsg = CMSG_NXTHDR(&msg, cmsg);
cmsg->cmsg_level = SOL_ALG;
cmsg->cmsg_type = ALG_SET_IV;
cmsg->cmsg_len = CMSG_LEN(
		offsetof(struct af_alg_iv, iv) + 16);
ivm = (void*)CMSG_DATA(cmsg);
ivm->ivlen = 16;
memset(ivm->iv, 0x23, 16);

msg.msg_iov = &iov;
msg.msg_iovlen = 1;

iov.iov_base = data;
iov.iov_len = sizeof(data);

len = sendmsg(op, &msg, 0);
if (len != sizeof(data))
{
return 1;
}
if (read(op, data, len) != len)
{
return 1;
}
printf(".");
fflush(stdout);
close(op);
}
close(tfm);
printf("\n");
return 0;
}



Re: [PATCH 4/4] crypto: algif_skcipher - User-space interface for skcipher operations

2010-11-06 Thread Martin Willi
Hi Herbert,

I did a proof-of-concept implementation for our crypto library; the
interface looks good so far. All our hash, hmac, xcbc and cipher test
vectors matched.

> +	sg_assign_page(sg + i, alloc_page(GFP_KERNEL));

Every skcipher operation leaks memory on my box (this page?). Should be
reproducible by doing encryption with any cipher.

Regards
Martin



[PATCH] xfrm: Fix truncation length of authentication algorithms installed via PF_KEY

2009-12-09 Thread Martin Willi
Commit 4447bb33f09444920a8f1d89e1540137429351b6 breaks installation of
authentication algorithms via PF_KEY, as the state specific truncation
length is not installed with the algorithm's default truncation length.
This patch properly initializes the state to the default if installed via
PF_KEY.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 net/key/af_key.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 84209fb..76fa6fe 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1193,6 +1193,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
 			x->aalg->alg_key_len = key->sadb_key_bits;
 			memcpy(x->aalg->alg_key, key+1, keysize);
 		}
+		x->aalg->alg_trunc_len = a->uinfo.auth.icv_truncbits;
 		x->props.aalgo = sa->sadb_sa_auth;
 		/* x->algo.flags = sa->sadb_sa_flags; */
}
-- 
1.6.3.3



[PATCH 3/3] xfrm: Use the user specified truncation length in ESP and AH

2009-11-25 Thread Martin Willi
Instead of using the hardcoded truncation for authentication
algorithms, use the truncation length specified on xfrm_state.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 net/ipv4/ah4.c  |2 +-
 net/ipv4/esp4.c |2 +-
 net/ipv6/ah6.c  |2 +-
 net/ipv6/esp6.c |2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 5c66270..b7be5ed 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -261,7 +261,7 @@ static int ah_init_state(struct xfrm_state *x)
}
 
 	ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
-	ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+	ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
 
 	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
 
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 12f7287..1948895 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -530,7 +530,7 @@ static int esp_init_authenc(struct xfrm_state *x)
}
 
 	err = crypto_aead_setauthsize(
-		aead, aalg_desc->uinfo.auth.icv_truncbits / 8);
+		aead, x->aalg->alg_trunc_len / 8);
if (err)
goto free_key;
}
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index c1589e2..0c2ae68 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -469,7 +469,7 @@ static int ah6_init_state(struct xfrm_state *x)
}
 
 	ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
-	ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+	ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
 
 	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
 
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index af597c7..668a46b 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -473,7 +473,7 @@ static int esp_init_authenc(struct xfrm_state *x)
}
 
 	err = crypto_aead_setauthsize(
-		aead, aalg_desc->uinfo.auth.icv_truncbits / 8);
+		aead, x->aalg->alg_trunc_len / 8);
if (err)
goto free_key;
}
-- 
1.6.3.3



[PATCH 0/3] xfrm: Custom truncation lengths for authentication algorithms

2009-11-25 Thread Martin Willi
The following patchset adds support for defining truncation lengths
for authentication algorithms in userspace. The main purpose for this
is to support SHA256 in IPsec using the standardized 128 bit
instead of the currently used 96 bit truncation.

Martin Willi (3):
  xfrm: Define new XFRM netlink auth attribute with specified
truncation bits
  xfrm: Store aalg in xfrm_state with a user specified truncation
length
  xfrm: Use the user specified truncation length in ESP and AH

 include/linux/xfrm.h  |8 +++
 include/net/xfrm.h|   12 -
 net/ipv4/ah4.c|2 +-
 net/ipv4/esp4.c   |2 +-
 net/ipv6/ah6.c|2 +-
 net/ipv6/esp6.c   |2 +-
 net/xfrm/xfrm_state.c |2 +-
 net/xfrm/xfrm_user.c  |  129 ++---
 8 files changed, 145 insertions(+), 14 deletions(-)



[PATCH 2/3] xfrm: Store aalg in xfrm_state with a user specified truncation length

2009-11-25 Thread Martin Willi
Adding an xfrm_state requires an authentication algorithm specified
either as xfrm_algo or as xfrm_algo_auth with a specific truncation
length. For compatibility, both attributes are dumped to userspace,
and we also accept both attributes, but prefer the new syntax.

If no truncation length is specified, or the authentication algorithm
is specified using xfrm_algo, the truncation length from the algorithm
description in the kernel is used.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 include/net/xfrm.h|   12 -
 net/xfrm/xfrm_state.c |2 +-
 net/xfrm/xfrm_user.c  |  129 ++---
 3 files changed, 133 insertions(+), 10 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 223e90a..762327d 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -160,7 +160,7 @@ struct xfrm_state
struct xfrm_lifetime_cfg lft;
 
 	/* Data for transformer */
-	struct xfrm_algo	*aalg;
+	struct xfrm_algo_auth	*aalg;
 	struct xfrm_algo	*ealg;
 	struct xfrm_algo	*calg;
 	struct xfrm_algo_aead	*aead;
@@ -1541,12 +1541,22 @@ static inline int xfrm_alg_len(struct xfrm_algo *alg)
 	return sizeof(*alg) + ((alg->alg_key_len + 7) / 8);
 }
 
+static inline int xfrm_alg_auth_len(struct xfrm_algo_auth *alg)
+{
+	return sizeof(*alg) + ((alg->alg_key_len + 7) / 8);
+}
+
 #ifdef CONFIG_XFRM_MIGRATE
 static inline struct xfrm_algo *xfrm_algo_clone(struct xfrm_algo *orig)
 {
return kmemdup(orig, xfrm_alg_len(orig), GFP_KERNEL);
 }
 
+static inline struct xfrm_algo_auth *xfrm_algo_auth_clone(struct xfrm_algo_auth *orig)
+{
+	return kmemdup(orig, xfrm_alg_auth_len(orig), GFP_KERNEL);
+}
+
 static inline void xfrm_states_put(struct xfrm_state **states, int n)
 {
int i;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index f2f7c63..67121ce 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1110,7 +1110,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp)
 	x->props.saddr = orig->props.saddr;
 
 	if (orig->aalg) {
-		x->aalg = xfrm_algo_clone(orig->aalg);
+		x->aalg = xfrm_algo_auth_clone(orig->aalg);
 		if (!x->aalg)
 			goto error;
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b95a2d6..fb42d77 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -62,6 +62,22 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
return 0;
 }
 
+static int verify_auth_trunc(struct nlattr **attrs)
+{
+   struct nlattr *rt = attrs[XFRMA_ALG_AUTH_TRUNC];
+   struct xfrm_algo_auth *algp;
+
+   if (!rt)
+   return 0;
+
+	algp = nla_data(rt);
+	if (nla_len(rt) < xfrm_alg_auth_len(algp))
+		return -EINVAL;
+
+	algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0';
+   return 0;
+}
+
 static int verify_aead(struct nlattr **attrs)
 {
struct nlattr *rt = attrs[XFRMA_ALG_AEAD];
@@ -128,7 +144,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
err = -EINVAL;
switch (p-id.proto) {
case IPPROTO_AH:
-		if (!attrs[XFRMA_ALG_AUTH]	||
+		if ((!attrs[XFRMA_ALG_AUTH] &&
+		     !attrs[XFRMA_ALG_AUTH_TRUNC]) ||
attrs[XFRMA_ALG_AEAD]   ||
attrs[XFRMA_ALG_CRYPT]  ||
attrs[XFRMA_ALG_COMP])
@@ -139,10 +156,12 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
if (attrs[XFRMA_ALG_COMP])
goto out;
 		if (!attrs[XFRMA_ALG_AUTH]	&&
+		    !attrs[XFRMA_ALG_AUTH_TRUNC] &&
 		    !attrs[XFRMA_ALG_CRYPT]	&&
!attrs[XFRMA_ALG_AEAD])
goto out;
 		if ((attrs[XFRMA_ALG_AUTH] ||
+		     attrs[XFRMA_ALG_AUTH_TRUNC] ||
 		     attrs[XFRMA_ALG_CRYPT]) &&
attrs[XFRMA_ALG_AEAD])
goto out;
@@ -152,6 +171,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
if (!attrs[XFRMA_ALG_COMP]  ||
attrs[XFRMA_ALG_AEAD]   ||
attrs[XFRMA_ALG_AUTH]   ||
+   attrs[XFRMA_ALG_AUTH_TRUNC] ||
attrs[XFRMA_ALG_CRYPT])
goto out;
break;
@@ -161,6 +181,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
case IPPROTO_ROUTING:
if (attrs[XFRMA_ALG_COMP]   ||
attrs[XFRMA_ALG_AUTH]   ||
+   attrs[XFRMA_ALG_AUTH_TRUNC] ||
attrs[XFRMA_ALG_AEAD]   ||
attrs[XFRMA_ALG_CRYPT]  ||
attrs[XFRMA_ENCAP]  ||
@@ -176,6 +197,8 @@ static int verify_newsa_info(struct

[PATCH 1/3] xfrm: Define new XFRM netlink auth attribute with specified truncation bits

2009-11-25 Thread Martin Willi
The new XFRMA_ALG_AUTH_TRUNC attribute taking a xfrm_algo_auth as
argument allows the installation of authentication algorithms with
a truncation length specified in userspace, i.e. SHA256 with 128 bit
instead of 96 bit truncation.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 include/linux/xfrm.h |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 2d4ec15..d28e853 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -96,6 +96,13 @@ struct xfrm_algo {
 	char		alg_key[0];
 };
 
+struct xfrm_algo_auth {
+	char		alg_name[64];
+	unsigned int	alg_key_len;	/* in bits */
+	unsigned int	alg_trunc_len;	/* in bits */
+	char		alg_key[0];
+};
+
 struct xfrm_algo_aead {
 	char		alg_name[64];
 	unsigned int	alg_key_len;	/* in bits */
@@ -283,6 +290,7 @@ enum xfrm_attr_type_t {
XFRMA_MIGRATE,
XFRMA_ALG_AEAD, /* struct xfrm_algo_aead */
XFRMA_KMADDRESS,/* struct xfrm_user_kmaddress */
+   XFRMA_ALG_AUTH_TRUNC,   /* struct xfrm_algo_auth */
__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
-- 
1.6.3.3


[PATCH] xfrm: Add SHA384 and SHA512 HMAC authentication algorithms to XFRM

2009-11-25 Thread Martin Willi
These algorithms use a truncation of 192/256 bits, as specified
in RFC4868.

Signed-off-by: Martin Willi mar...@strongswan.org
---
 net/xfrm/xfrm_algo.c |   34 ++++++++++++++++++++++++++++++++++
 1 files changed, 34 insertions(+), 0 deletions(-)

diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index faf54c6..480afda 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -200,6 +200,40 @@ static struct xfrm_algo_desc aalg_list[] = {
}
 },
 {
+	.name = "hmac(sha384)",
+
+   .uinfo = {
+   .auth = {
+   .icv_truncbits = 192,
+   .icv_fullbits = 384,
+   }
+   },
+
+   .desc = {
+   .sadb_alg_id = SADB_X_AALG_SHA2_384HMAC,
+   .sadb_alg_ivlen = 0,
+   .sadb_alg_minbits = 384,
+   .sadb_alg_maxbits = 384
+   }
+},
+{
+	.name = "hmac(sha512)",
+
+   .uinfo = {
+   .auth = {
+   .icv_truncbits = 256,
+   .icv_fullbits = 512,
+   }
+   },
+
+   .desc = {
+   .sadb_alg_id = SADB_X_AALG_SHA2_512HMAC,
+   .sadb_alg_ivlen = 0,
+   .sadb_alg_minbits = 512,
+   .sadb_alg_maxbits = 512
+   }
+},
+{
	.name = "hmac(rmd160)",
	.compat = "rmd160",
 
-- 
1.6.3.3


Re: HMAC regression

2009-05-31 Thread Martin Willi
> You must be getting an sg entry that crosses a page boundary, rather than
> two sg entries that both stay within a page.

Yes.

> These things are very rare, and usually occur as
> a result of SLAB debugging causing kmalloc to return memory that
> crosses page boundaries.

Indeed, SLAB_DEBUG was enabled in my config. Disabling it resolves this
issue.

> Can you see if this patch fixes the problem?

Yes, it fixes the HMAC calculation with SLAB debugging enabled.

Thanks for your help!
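
To make the condition above concrete, a minimal sketch (assuming, as
discussed, that slab debugging makes kmalloc() return a buffer that is
not page-aligned): a single scatterlist entry then satisfies
offset + length > PAGE_SIZE, and the hash walk must map a second page.

#include <linux/mm.h>
#include <linux/scatterlist.h>

/* Sketch only: detect when one sg entry spans two pages. */
static bool sg_spans_two_pages(const void *buf, unsigned int len)
{
	struct scatterlist sg;

	sg_init_one(&sg, buf, len);
	/* The ahash walk has to map a second page exactly when the
	 * entry extends past the end of its first page. */
	return sg.offset + sg.length > PAGE_SIZE;
}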


HMAC regression

2009-05-28 Thread Martin Willi
Hi,

Switching the hash implementations to the new shash API introduced a
regression. HMACs are created incorrectly if the data is scattered over
multiple pages, resulting in very unreliable IPsec tunnels.

The appended patch adds a silly hmac(sha1) test vector larger than a 4KB
page; it fails on current crypto-2.6. It runs successfully when reverting
sha1 to the old hash API (54ccb367).

Data on the first page gets hashed correctly, but walking to the next
page fails. The new page does not get mapped and the remaining bytes are
read from the beginning of the first page, resulting in wrong MACs. I
did not fully understand the hash_walk code in ahash.c, but either the
length/offset calculation is wrong (when using compat functions) or the
scatterlist is not set up correctly.
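
Independent of the test vector below, the failing pattern can be
sketched with the ahash API roughly as follows (a hypothetical snippet,
error handling trimmed, synchronous implementation assumed); on the
broken walk, the bytes of the second page were never mapped:

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>

/* Digest two scattered pages with hmac(sha1). */
static int hmac_two_pages(struct page *p0, struct page *p1,
			  const u8 *key, unsigned int keylen, u8 *out)
{
	struct crypto_ahash *tfm;
	struct ahash_request *req;
	struct scatterlist sg[2];
	int err;

	tfm = crypto_alloc_ahash("hmac(sha1)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_ahash_setkey(tfm, key, keylen);
	if (err)
		goto out_tfm;

	req = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_tfm;
	}
	ahash_request_set_callback(req, 0, NULL, NULL);

	sg_init_table(sg, 2);
	sg_set_page(&sg[0], p0, PAGE_SIZE, 0);
	sg_set_page(&sg[1], p1, PAGE_SIZE, 0);
	ahash_request_set_crypt(req, sg, out, 2 * PAGE_SIZE);

	err = crypto_ahash_digest(req);

	ahash_request_free(req);
out_tfm:
	crypto_free_ahash(tfm);
	return err;
}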

Martin

---
 crypto/testmgr.h |  252 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 250 insertions(+), 2 deletions(-)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 526f00a..ab90006 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -32,7 +32,7 @@ struct hash_testvec {
char *plaintext;
char *digest;
unsigned char tap[MAX_TAP];
-   unsigned char psize;
+   unsigned int psize;
unsigned char np;
unsigned char ksize;
 };
@@ -1238,7 +1238,7 @@ static struct hash_testvec hmac_rmd160_tv_template[] = {
 /*
  * HMAC-SHA1 test vectors from RFC2202
  */
-#define HMAC_SHA1_TEST_VECTORS 7
+#define HMAC_SHA1_TEST_VECTORS 8
 
 static struct hash_testvec hmac_sha1_tv_template[] = {
{
@@ -1314,6 +1314,254 @@ static struct hash_testvec hmac_sha1_tv_template[] = {
.psize  = 73,
 		.digest	= "\xe8\xe9\x9d\x0f\x45\x23\x7d\x78\x6d\x6b"
 			  "\xba\xa7\x96\x5c\x78\x08\xbb\xff\x1a\x91",
+	}, {
+		.key	= "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c"
+			  "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c",
+		.ksize	= 20,
+		.plaintext =
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"
+
+	"\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90\x12\x34\x56\x78\x90"