[PATCH v2 4/5] crypto: AES CBC by8 encryption

2015-10-29 Thread Tim Chen

This patch introduces the assembly routine to do a by8 AES CBC encryption
in support of the AES CBC multi-buffer implementation.

Encryption of 8 data streams of a key size are done simultaneously.

Originally-by: Chandramouli Narayanan 
Signed-off-by: Tim Chen 
---
 arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S | 774 
 1 file changed, 774 insertions(+)
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S

diff --git a/arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S 
b/arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S
new file mode 100644
index 000..eaffc28
--- /dev/null
+++ b/arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S
@@ -0,0 +1,774 @@
+/*
+ * AES CBC by8 multibuffer optimization (x86_64)
+ * This file implements 128/192/256 bit AES CBC encryption
+ *
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford 
+ * Sean Gulley 
+ * Tim Chen 
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include 
+
+/* stack size needs to be an odd multiple of 8 for alignment */
+
+#define AES_KEYSIZE_12816
+#define AES_KEYSIZE_19224
+#define AES_KEYSIZE_25632
+
+#define XMM_SAVE_SIZE  16*10
+#define GPR_SAVE_SIZE  8*9
+#define STACK_SIZE (XMM_SAVE_SIZE + GPR_SAVE_SIZE)
+
+#define GPR_SAVE_REG   %rsp
+#define GPR_SAVE_AREA  %rsp + XMM_SAVE_SIZE
+#define LEN_AREA_OFFSETXMM_SAVE_SIZE + 8*8
+#define LEN_AREA_REG   %rsp
+#define LEN_AREA   %rsp + XMM_SAVE_SIZE + 8*8
+
+#define IN_OFFSET  0
+#define OUT_OFFSET 8*8
+#define KEYS_OFFSET16*8
+#define IV_OFFSET  24*8
+
+
+#define IDX%rax
+#define TMP%rbx
+#define ARG%rdi
+#define LEN%rsi
+
+#define KEYS0  %r14
+#define KEYS1  %r15
+#define KEYS2  %rbp
+#define KEYS3  %rdx
+#define KEYS4  %rcx
+#define KEYS5  %r8
+#define KEYS6  %r9
+#define KEYS7  %r10
+
+#define IN0%r11
+#define IN2%r12
+#define IN4%r13
+#define IN6LEN
+
+#define XDATA0 %xmm0
+#define XDATA1 %xmm1
+#define XDATA2 %xmm2
+#define XDATA3 %xmm3
+#define XDATA4 %xmm4
+#define XDATA5 %xmm5
+#define XDATA6 %xmm6
+#define XDATA7 %xmm7
+
+#define XKEY0_3%xmm8
+#define XKEY1_4%xmm9
+#define XKEY2_5%xmm10
+#define XKEY3_6%xmm11
+#define XKEY4_7%xmm12
+#define XKEY5_8%xmm13
+#define XKEY6_9%xmm14
+#define XTMP   %xmm15
+
+#defineMOVDQ movdqu /* assume buffers not aligned */
+#define CONCAT(a, b)   a##b
+#define INPUT_REG_SUFX 1   /* IN */
+#define XDATA_REG_SUFX 2   /* XDAT */
+#define KEY_REG_SUFX   3   /* KEY */
+#define XMM_REG_SUFX   4   /* XMM */
+
+/*
+ * To avoid positional parameter errors while compiling
+ * three registers need to be passed
+ */
+.text
+
+.macro pxor2 x, y, z
+   MOVDQ   (\x,\y), XTMP
+   pxorXTMP, \z
+.endm
+
+.macro inreg n

[PATCH v2 2/5] crypto: AES CBC multi-buffer data structures

2015-10-29 Thread Tim Chen

This patch introduces the data structures and prototypes of functions
needed for doing AES CBC encryption using multi-buffer. Included are
the structures of the multi-buffer AES CBC job, job scheduler in C and
data structure defines in x86 assembly code.

Originally-by: Chandramouli Narayanan 
Signed-off-by: Tim Chen 
---
 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_ctx.h|  96 +
 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_mgr.h| 131 
 arch/x86/crypto/aes-cbc-mb/mb_mgr_datastruct.S | 270 +
 arch/x86/crypto/aes-cbc-mb/reg_sizes.S | 125 
 4 files changed, 622 insertions(+)
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_ctx.h
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_mgr.h
 create mode 100644 arch/x86/crypto/aes-cbc-mb/mb_mgr_datastruct.S
 create mode 100644 arch/x86/crypto/aes-cbc-mb/reg_sizes.S

diff --git a/arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_ctx.h 
b/arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_ctx.h
new file mode 100644
index 000..5493f83
--- /dev/null
+++ b/arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_ctx.h
@@ -0,0 +1,96 @@
+/*
+ * Header file for multi buffer AES CBC algorithm manager
+ * that deals with 8 buffers at a time
+ *
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford 
+ * Sean Gulley 
+ * Tim Chen 
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef __AES_CBC_MB_CTX_H
+#define __AES_CBC_MB_CTX_H
+
+
+#include 
+
+#include "aes_cbc_mb_mgr.h"
+
+#define CBC_ENCRYPT0x01
+#define CBC_DECRYPT0x02
+#define CBC_START  0x04
+#define CBC_DONE   0x08
+
+#define CBC_CTX_STS_IDLE   0x00
+#define CBC_CTX_STS_PROCESSING 0x01
+#define CBC_CTX_STS_LAST   0x02
+#define CBC_CTX_STS_COMPLETE   0x04
+
+enum cbc_ctx_error {
+   CBC_CTX_ERROR_NONE   =  0,
+   CBC_CTX_ERROR_INVALID_FLAGS  = -1,
+   CBC_CTX_ERROR_ALREADY_PROCESSING = -2,
+   CBC_CTX_ERROR_ALREADY_COMPLETED  = -3,
+};
+
+#define cbc_ctx_init(ctx, nbytes, op) \
+   do { \
+   (ctx)->flag = (op) | CBC_START; \
+   (ctx)->nbytes = nbytes; \
+   } while (0)
+
+/* AESNI routines to perform cbc decrypt and key expansion */
+
+asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+unsigned int key_len);
+
+#endif /* __AES_CBC_MB_CTX_H */
diff --git a/arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_mgr.h 
b/arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_mgr.h
new file mode 100644
index 000..0def82e
--- /dev/null
+++ b/arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_mgr.h
@@ -0,0 +1,131 @@
+/*
+ * Header file for multi buffer AES CBC algorithm manager
+ *

[PATCH v2 1/5] crypto: Multi-buffer encryptioin infrastructure support

2015-10-29 Thread Tim Chen

In this patch, the infrastructure needed to support multibuffer
encryption implementation is added:

a) Enhace mcryptd daemon to support blkcipher requests.

b) Update configuration to include multi-buffer encryption build support.

c) Add support to crypto scatterwalk support that can sleep during
encryption operation, as we may have buffers for jobs in data lanes
that are half-finished, waiting for additional jobs to come to fill
empty lanes before we start the encryption again.  Therefore, we need to
enhance crypto walk with the option to map data buffers non-atomically.
This is done by algorithms run from crypto daemon who knows it is safe
to do so as it can save and restore FPU state in correct context.

For an introduction to the multi-buffer implementation, please see
http://www.intel.com/content/www/us/en/communications/communications-ia-multi-buffer-paper.html

Originally-by: Chandramouli Narayanan 
Signed-off-by: Tim Chen 
---
 crypto/Kconfig   |  16 +++
 crypto/blkcipher.c   |  29 -
 crypto/mcryptd.c | 256 ++-
 crypto/scatterwalk.c |   7 ++
 include/crypto/algapi.h  |   1 +
 include/crypto/mcryptd.h |  36 ++
 include/crypto/scatterwalk.h |   6 +
 include/linux/crypto.h   |   1 +
 8 files changed, 347 insertions(+), 5 deletions(-)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 7240821..6b51084 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -888,6 +888,22 @@ config CRYPTO_AES_NI_INTEL
  ECB, CBC, LRW, PCBC, XTS. The 64 bit version has additional
  acceleration for CTR.
 
+config CRYPTO_AES_CBC_MB
+   tristate "AES CBC algorithm (x86_64 Multi-Buffer, Experimental)"
+   depends on X86 && 64BIT
+   select CRYPTO_ABLK_HELPER
+   select CRYPTO_MCRYPTD
+   help
+ AES CBC encryption implemented using multi-buffer technique.
+ This algorithm computes on multiple data lanes concurrently with
+ SIMD instructions for better throughput.  It should only be
+ used when there is significant work to generate many separate
+ crypto requests that keep all the data lanes filled to get
+ the performance benefit.  If the data lanes are unfilled, a
+ flush operation will be initiated after some delay to process
+ the exisiting crypto jobs, adding some extra latency at low
+ load case.
+
 config CRYPTO_AES_SPARC64
tristate "AES cipher algorithms (SPARC64)"
depends on SPARC64
diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c
index 11b9814..9fd4028 100644
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -35,6 +35,9 @@ enum {
BLKCIPHER_WALK_SLOW = 1 << 1,
BLKCIPHER_WALK_COPY = 1 << 2,
BLKCIPHER_WALK_DIFF = 1 << 3,
+   /* deal with scenarios where we can sleep during sg walk */
+   /* when we process part of a request */
+   BLKCIPHER_WALK_MAY_SLEEP = 1 << 4,
 };
 
 static int blkcipher_walk_next(struct blkcipher_desc *desc,
@@ -44,22 +47,38 @@ static int blkcipher_walk_first(struct blkcipher_desc *desc,
 
 static inline void blkcipher_map_src(struct blkcipher_walk *walk)
 {
-   walk->src.virt.addr = scatterwalk_map(&walk->in);
+   /* add support for asynchronous requests which need no atomic map */
+   if (walk->flags & BLKCIPHER_WALK_MAY_SLEEP)
+   walk->src.virt.addr = scatterwalk_map_nonatomic(&walk->in);
+   else
+   walk->src.virt.addr = scatterwalk_map(&walk->in);
 }
 
 static inline void blkcipher_map_dst(struct blkcipher_walk *walk)
 {
-   walk->dst.virt.addr = scatterwalk_map(&walk->out);
+   /* add support for asynchronous requests which need no atomic map */
+   if (walk->flags & BLKCIPHER_WALK_MAY_SLEEP)
+   walk->dst.virt.addr = scatterwalk_map_nonatomic(&walk->out);
+   else
+   walk->dst.virt.addr = scatterwalk_map(&walk->out);
 }
 
 static inline void blkcipher_unmap_src(struct blkcipher_walk *walk)
 {
-   scatterwalk_unmap(walk->src.virt.addr);
+   /* add support for asynchronous requests which need no atomic map */
+   if (walk->flags & BLKCIPHER_WALK_MAY_SLEEP)
+   scatterwalk_unmap_nonatomic(walk->src.virt.addr);
+   else
+   scatterwalk_unmap(walk->src.virt.addr);
 }
 
 static inline void blkcipher_unmap_dst(struct blkcipher_walk *walk)
 {
-   scatterwalk_unmap(walk->dst.virt.addr);
+   /* add support for asynchronous requests which need no atomic map */
+   if (walk->flags & BLKCIPHER_WALK_MAY_SLEEP)
+   scatterwalk_unmap_nonatomic(walk->dst.virt.addr);
+   else
+   scatterwalk_unmap(walk->dst.virt.addr);
 }
 
 /* Get a spot of the specified length that does not straddle a page.
@@ -299,6 +318,8 @@ static inline int blkcipher_copy_iv(struct blkcipher_walk 
*walk)
 int blkcipher_walk_virt(struct blkcipher_desc *desc,
struct blkcipher_walk *walk)
 {

[PATCH v2 5/5] crypto: AES CBC multi-buffer glue code

2015-10-29 Thread Tim Chen

This patch introduces the multi-buffer job manager which is responsible
for submitting scatter-gather buffers from several AES CBC jobs
to the multi-buffer algorithm. The glue code interfaces with the
underlying algorithm that handles 8 data streams of AES CBC encryption
in parallel. AES key expansion and CBC decryption requests are performed
in a manner similar to the existing AESNI Intel glue driver.

The outline of the algorithm for AES CBC encryption requests is
sketched below:

Any driver requesting the crypto service will place an async crypto
request on the workqueue.  The multi-buffer crypto daemon will pull an
AES CBC encryption request from work queue and put each request in an
empty data lane for multi-buffer crypto computation.  When all the empty
lanes are filled, computation will commence on the jobs in parallel and
the job with the shortest remaining buffer will get completed and be
returned. To prevent prolonged stall, when no new jobs arrive, we will
flush workqueue of jobs after a maximum allowable delay has elapsed.

To accommodate the fragmented nature of scatter-gather, we will keep
submitting the next scatter-buffer fragment for a job for multi-buffer
computation until a job is completed and no more buffer fragments remain.
At that time we will pull a new job to fill the now empty data slot.
We check with the multibuffer scheduler to see if there are other
completed jobs to prevent extraneous delay in returning any completed
jobs.

This multi-buffer algorithm should be used for cases where we get at
least 8 streams of crypto jobs submitted at a reasonably high rate.
For low crypto job submission rate and low number of data streams, this
algorithm will not be beneficial. The reason is at low rate, we do not
fill out the data lanes before flushing the jobs instead of processing
them with all the data lanes full.  We will miss the benefit of parallel
computation, and adding delay to the processing of the crypto job at the
same time.  Some tuning of the maximum latency parameter may be needed
to get the best performance.

Originally-by: Chandramouli Narayanan 
Signed-off-by: Tim Chen 
---
 arch/x86/crypto/Makefile|   1 +
 arch/x86/crypto/aes-cbc-mb/Makefile |  22 +
 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb.c | 812 
 3 files changed, 835 insertions(+)
 create mode 100644 arch/x86/crypto/aes-cbc-mb/Makefile
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index b9b912a..000db49 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
+obj-$(CONFIG_CRYPTO_AES_CBC_MB) += aes-cbc-mb/
 obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
 
 # These modules require assembler to support AVX.
diff --git a/arch/x86/crypto/aes-cbc-mb/Makefile 
b/arch/x86/crypto/aes-cbc-mb/Makefile
new file mode 100644
index 000..b642bd8
--- /dev/null
+++ b/arch/x86/crypto/aes-cbc-mb/Makefile
@@ -0,0 +1,22 @@
+#
+# Arch-specific CryptoAPI modules.
+#
+
+avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
+
+# we need decryption and key expansion routine symbols
+# if either AESNI_NI_INTEL or AES_CBC_MB is a module
+
+ifeq ($(CONFIG_CRYPTO_AES_NI_INTEL),m)
+   dec_support := ../aesni-intel_asm.o
+endif
+ifeq ($(CONFIG_CRYPTO_AES_CBC_MB),m)
+   dec_support := ../aesni-intel_asm.o
+endif
+
+ifeq ($(avx_supported),yes)
+   obj-$(CONFIG_CRYPTO_AES_CBC_MB) += aes-cbc-mb.o
+   aes-cbc-mb-y := $(dec_support) aes_cbc_mb.o aes_mb_mgr_init.o \
+   mb_mgr_inorder_x8_asm.o mb_mgr_ooo_x8_asm.o \
+   aes_cbc_enc_x8.o
+endif
diff --git a/arch/x86/crypto/aes-cbc-mb/aes_cbc_mb.c 
b/arch/x86/crypto/aes-cbc-mb/aes_cbc_mb.c
new file mode 100644
index 000..6a03712
--- /dev/null
+++ b/arch/x86/crypto/aes-cbc-mb/aes_cbc_mb.c
@@ -0,0 +1,812 @@
+/*
+ * Multi buffer AES CBC algorithm glue code
+ *
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford 
+ * Sean Gulley 
+ * Tim Chen 
+ */
+
+#define pr_fmt(fmt)KBUILD_MODNAME "

[PATCH v2 3/5] crypto: AES CBC multi-buffer scheduler

2015-10-29 Thread Tim Chen

This patch implements in-order scheduler for encrypting multiple buffers
in parallel supporting AES CBC encryption with key sizes of
128, 192 and 256 bits. It uses 8 data lanes by taking advantage of the
SIMD instructions with XMM registers.

The multibuffer manager and scheduler is mostly written in assembly and
the initialization support is written C. The AES CBC multibuffer crypto
driver support interfaces with the multibuffer manager and scheduler
to support AES CBC encryption in parallel. The scheduler supports
job submissions, job flushing and and job retrievals after completion.

The basic flow of usage of the CBC multibuffer scheduler is as follows:

- The caller allocates an aes_cbc_mb_mgr_inorder_x8 object
and initializes it once by calling aes_cbc_init_mb_mgr_inorder_x8().

- The aes_cbc_mb_mgr_inorder_x8 structure has an array of JOB_AES
objects. Allocation and scheduling of JOB_AES objects are managed
by the multibuffer scheduler support routines. The caller allocates
a JOB_AES using aes_cbc_get_next_job_inorder_x8().

- The returned JOB_AES must be filled in with parameters for CBC
encryption (eg: plaintext buffer, ciphertext buffer, key, iv, etc) and
submitted to the manager object using aes_cbc_submit_job_inorder_xx().

- If the oldest JOB_AES is completed during a call to
aes_cbc_submit_job_inorder_x8(), it is returned. Otherwise,
NULL is returned.

- A call to aes_cbc_flush_job_inorder_x8() always returns the
oldest job, unless the multibuffer manager is empty of jobs.

- A call to aes_cbc_get_completed_job_inorder_x8() returns
a completed job. This routine is useful to process completed
jobs instead of waiting for the flusher to engage.

- When a job is returned from submit or flush, the caller extracts
the useful data and returns it to the multibuffer manager implicitly
by the next call to aes_cbc_get_next_job_xx().

Jobs are always returned from submit or flush routines in the order they
were submitted (hence "inorder").A job allocated using
aes_cbc_get_next_job_inorder_x8() must be filled in and submitted before
another call. A job returned by aes_cbc_submit_job_inorder_x8() or
aes_cbc_flush_job_inorder_x8() is 'deallocated' upon the next call to
get a job structure. Calls to get_next_job() cannot fail. If all jobs are
allocated after a call to get_next_job(), the subsequent call to submit
always returns the oldest job in a completed state.

Originally-by: Chandramouli Narayanan 
Signed-off-by: Tim Chen 
---
 arch/x86/crypto/aes-cbc-mb/aes_mb_mgr_init.c   | 145 +++
 arch/x86/crypto/aes-cbc-mb/mb_mgr_inorder_x8_asm.S | 222 +++
 arch/x86/crypto/aes-cbc-mb/mb_mgr_ooo_x8_asm.S | 416 +
 3 files changed, 783 insertions(+)
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_mb_mgr_init.c
 create mode 100644 arch/x86/crypto/aes-cbc-mb/mb_mgr_inorder_x8_asm.S
 create mode 100644 arch/x86/crypto/aes-cbc-mb/mb_mgr_ooo_x8_asm.S

diff --git a/arch/x86/crypto/aes-cbc-mb/aes_mb_mgr_init.c 
b/arch/x86/crypto/aes-cbc-mb/aes_mb_mgr_init.c
new file mode 100644
index 000..7a7f8a1
--- /dev/null
+++ b/arch/x86/crypto/aes-cbc-mb/aes_mb_mgr_init.c
@@ -0,0 +1,145 @@
+/*
+ * Initialization code for multi buffer AES CBC algorithm
+ *
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford 
+ * Sean Gulley 
+ * Tim Chen 
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE

[PATCH v2 0/5] crypto: x86 AES-CBC encryption with multibuffer

2015-10-29 Thread Tim Chen

In this patch series, we introduce AES CBC encryption that is parallelized
on x86_64 cpu with XMM registers. The multi-buffer technique encrypt 8
data streams in parallel with SIMD instructions. Decryption is handled
as in the existing AESNI Intel CBC implementation which can already
parallelize decryption even for a single data stream.

Please see the multi-buffer whitepaper for details of the technique:
http://www.intel.com/content/www/us/en/communications/communications-ia-multi-buffer-paper.html

It is important that any driver uses this algorithm properly for scenarios
where we have many data streams that can fill up the data lanes most of
the time.  It shouldn't be used when only a single data stream is expected
mostly. Otherwise we may incurr extra delays when we have frequent gaps in
data lanes, causing us to wait till data come in to fill the data lanes
before initiating encryption.  We may have to wait for flush operations
to commence when no new data come in after some wait time. However we
keep this extra delay to a minimum by opportunistically flushing the
unfinished jobs if crypto daemon is the only active task running on a cpu.

By using this technique, we saw a throughput increase of up to
5.8x under optimal conditions when we have fully loaded encryption jobs
filling up all the data lanes.

Change Log:
v2
1. Update cpu feature check to make sure SSE is supported
2. Fix up unloading of aes-cbc-mb module to properly free memory

Tim Chen (5):
  crypto: Multi-buffer encryptioin infrastructure support
  crypto: AES CBC multi-buffer data structures
  crypto: AES CBC multi-buffer scheduler
  crypto: AES CBC by8 encryption
  crypto: AES CBC multi-buffer glue code

 arch/x86/crypto/Makefile   |   1 +
 arch/x86/crypto/aes-cbc-mb/Makefile|  22 +
 arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S| 774 
 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb.c| 812 +
 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_ctx.h|  96 +++
 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_mgr.h| 131 
 arch/x86/crypto/aes-cbc-mb/aes_mb_mgr_init.c   | 145 
 arch/x86/crypto/aes-cbc-mb/mb_mgr_datastruct.S | 270 +++
 arch/x86/crypto/aes-cbc-mb/mb_mgr_inorder_x8_asm.S | 222 ++
 arch/x86/crypto/aes-cbc-mb/mb_mgr_ooo_x8_asm.S | 416 +++
 arch/x86/crypto/aes-cbc-mb/reg_sizes.S | 125 
 crypto/Kconfig |  16 +
 crypto/blkcipher.c |  29 +-
 crypto/mcryptd.c   | 256 ++-
 crypto/scatterwalk.c   |   7 +
 include/crypto/algapi.h|   1 +
 include/crypto/mcryptd.h   |  36 +
 include/crypto/scatterwalk.h   |   6 +
 include/linux/crypto.h |   1 +
 19 files changed, 3361 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/crypto/aes-cbc-mb/Makefile
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb.c
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_ctx.h
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_cbc_mb_mgr.h
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_mb_mgr_init.c
 create mode 100644 arch/x86/crypto/aes-cbc-mb/mb_mgr_datastruct.S
 create mode 100644 arch/x86/crypto/aes-cbc-mb/mb_mgr_inorder_x8_asm.S
 create mode 100644 arch/x86/crypto/aes-cbc-mb/mb_mgr_ooo_x8_asm.S
 create mode 100644 arch/x86/crypto/aes-cbc-mb/reg_sizes.S

-- 
1.7.11.7


--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/5] crypto: AES CBC multi-buffer glue code

2015-10-29 Thread Tim Chen
On Thu, 2015-10-29 at 09:19 -0700, Tim Chen wrote:
> On Thu, 2015-10-29 at 03:03 +0100, Stephan Mueller wrote:
> > Am Mittwoch, 28. Oktober 2015, 14:19:29 schrieb Tim Chen:
> > 
> > Hi Tim,
> > 
> > >+
> > >+  /* check for dependent cpu features */
> > >+  if (!cpu_has_aes) {
> > >+  pr_err("aes_cbc_mb_mod_init: no aes support\n");
> > >+  err = -ENODEV;
> > >+  goto err1;
> > >+  }
> > 
> > In your post 0/5, you say that this mechanism needs AVX2. In the existing 
> > AESNI glue code I find
> > 
> > #ifdef CONFIG_X86_64
> > #ifdef CONFIG_AS_AVX2
> > if (boot_cpu_has(X86_FEATURE_AVX2)) {
> > 
> > ...
> > 
> > Why would that CPU check not be needed here?
> 
> Good catch.  Will add check for avx2.

Actually since we are using only XMM registers and SSE instructions,
checking for SSE support will be sufficient.

Tim

--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/5] crypto: AES CBC multi-buffer glue code

2015-10-29 Thread Tim Chen
On Thu, 2015-10-29 at 03:03 +0100, Stephan Mueller wrote:
> Am Mittwoch, 28. Oktober 2015, 14:19:29 schrieb Tim Chen:
> 
> Hi Tim,
> 
> >+
> >+/* check for dependent cpu features */
> >+if (!cpu_has_aes) {
> >+pr_err("aes_cbc_mb_mod_init: no aes support\n");
> >+err = -ENODEV;
> >+goto err1;
> >+}
> 
> In your post 0/5, you say that this mechanism needs AVX2. In the existing 
> AESNI glue code I find
> 
> #ifdef CONFIG_X86_64
> #ifdef CONFIG_AS_AVX2
> if (boot_cpu_has(X86_FEATURE_AVX2)) {
> 
> ...
> 
> Why would that CPU check not be needed here?

Good catch.  Will add check for avx2.

Tim

> 
> Ciao
> Stephan


--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] crypto: x86: Remove duplicate define of SHA1_DIGEST_SIZE

2015-10-29 Thread Tim Chen
On Thu, 2015-10-29 at 08:51 +0100, LABBE Corentin wrote:
> On Fri, Oct 16, 2015 at 09:04:58AM -0700, Tim Chen wrote:
> > On Wed, 2015-10-14 at 21:15 +0200, LABBE Corentin wrote:
> > > The sha x86 crypto code use two define for the same thing:
> > > NUM_SHA1_DIGEST_WORDS and SHA1_DIGEST_LENGTH
> > > Replace them by SHA1_DIGEST_SIZE/4
> > 
> > Thanks.  Acked-by: Tim Chen 
> > > 
> > > Signed-off-by: LABBE Corentin 
> > 
> 
> Hello
> 
> Thanks for your ack, but Thomas Gleixner in the same time NACK this patch.
> Just in case I attach the new patch, for permit you to decide which one you 
> prefer.
> 
> Regards

Looks fine.  But wonder if you should have
#define SHA1_DIGEST_WORDS (SHA1_DIGEST_SIZE / sizeof(u32))

moved to sha1.h

Tim
> 

> diff --git a/arch/x86/crypto/sha-mb/sha_mb_ctx.h 
> b/arch/x86/crypto/sha-mb/sha_mb_ctx.h
> index e36069d..9fd36eb5 100644
> --- a/arch/x86/crypto/sha-mb/sha_mb_ctx.h
> +++ b/arch/x86/crypto/sha-mb/sha_mb_ctx.h
> @@ -94,7 +94,6 @@ enum hash_ctx_error {
>  
> 
>  /* Hash Constants and Typedefs */
> -#define SHA1_DIGEST_LENGTH  5
>  #define SHA1_LOG2_BLOCK_SIZE6
>  
>  #define SHA1_PADLENGTHFIELD_SIZE8
> diff --git a/arch/x86/crypto/sha-mb/sha_mb_mgr.h 
> b/arch/x86/crypto/sha-mb/sha_mb_mgr.h
> index 08ad1a9..b295e15 100644
> --- a/arch/x86/crypto/sha-mb/sha_mb_mgr.h
> +++ b/arch/x86/crypto/sha-mb/sha_mb_mgr.h
> @@ -54,10 +54,10 @@
>  #ifndef __SHA_MB_MGR_H
>  #define __SHA_MB_MGR_H
>  
> -
> +#include 
>  #include 
>  
> -#define NUM_SHA1_DIGEST_WORDS 5
> +#define SHA1_DIGEST_WORDS (SHA1_DIGEST_SIZE / sizeof(u32))

Suggest to move SHA1_DIGEST_WORDS to sha1.h

Tim



--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v1 0/4] TPM2: select hash algorithm for a trusted key

2015-10-29 Thread Jarkko Sakkinen
Jarkko Sakkinen (4):
  crypto: add entry for sm3-256
  tpm: choose hash algorithm for sealing when using TPM 2.0
  keys, trusted: select the hash algorithm
  keys, trusted: update documentation for 'hash=' option

 Documentation/security/keys-trusted-encrypted.txt |  3 ++
 crypto/hash_info.c|  2 ++
 drivers/char/tpm/tpm.h| 10 --
 drivers/char/tpm/tpm2-cmd.c   | 42 +--
 include/crypto/hash_info.h|  3 ++
 include/keys/trusted-type.h   |  1 +
 include/uapi/linux/hash_info.h|  1 +
 security/keys/trusted.c   | 20 ++-
 8 files changed, 75 insertions(+), 7 deletions(-)

-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v1 1/4] crypto: add entry for sm3-256

2015-10-29 Thread Jarkko Sakkinen
Added entry for sm3-256 to the following tables:

* hash_algo_name
* hash_digest_size

Needed for TPM 2.0 trusted key sealing.

Signed-off-by: Jarkko Sakkinen 
---
 crypto/hash_info.c | 2 ++
 include/crypto/hash_info.h | 3 +++
 include/uapi/linux/hash_info.h | 1 +
 3 files changed, 6 insertions(+)

diff --git a/crypto/hash_info.c b/crypto/hash_info.c
index 3e7ff46..6f3a113 100644
--- a/crypto/hash_info.c
+++ b/crypto/hash_info.c
@@ -31,6 +31,7 @@ const char *const hash_algo_name[HASH_ALGO__LAST] = {
[HASH_ALGO_TGR_128] = "tgr128",
[HASH_ALGO_TGR_160] = "tgr160",
[HASH_ALGO_TGR_192] = "tgr192",
+   [HASH_ALGO_SM3_256] = "sm3-256",
 };
 EXPORT_SYMBOL_GPL(hash_algo_name);
 
@@ -52,5 +53,6 @@ const int hash_digest_size[HASH_ALGO__LAST] = {
[HASH_ALGO_TGR_128] = TGR128_DIGEST_SIZE,
[HASH_ALGO_TGR_160] = TGR160_DIGEST_SIZE,
[HASH_ALGO_TGR_192] = TGR192_DIGEST_SIZE,
+   [HASH_ALGO_SM3_256] = SM3_256_DIGEST_SIZE,
 };
 EXPORT_SYMBOL_GPL(hash_digest_size);
diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h
index e1e5a3e..d86e050 100644
--- a/include/crypto/hash_info.h
+++ b/include/crypto/hash_info.h
@@ -34,6 +34,9 @@
 #define TGR160_DIGEST_SIZE 20
 #define TGR192_DIGEST_SIZE 24
 
+/* not defined in include/crypto/ */
+#define SM3_256_DIGEST_SIZE 32
+
 extern const char *const hash_algo_name[HASH_ALGO__LAST];
 extern const int hash_digest_size[HASH_ALGO__LAST];
 
diff --git a/include/uapi/linux/hash_info.h b/include/uapi/linux/hash_info.h
index ca18c45..ebf8fd8 100644
--- a/include/uapi/linux/hash_info.h
+++ b/include/uapi/linux/hash_info.h
@@ -31,6 +31,7 @@ enum hash_algo {
HASH_ALGO_TGR_128,
HASH_ALGO_TGR_160,
HASH_ALGO_TGR_192,
+   HASH_ALGO_SM3_256,
HASH_ALGO__LAST
 };
 
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] crypto: x86: Remove duplicate define of SHA1_DIGEST_SIZE

2015-10-29 Thread LABBE Corentin
On Fri, Oct 16, 2015 at 09:04:58AM -0700, Tim Chen wrote:
> On Wed, 2015-10-14 at 21:15 +0200, LABBE Corentin wrote:
> > The sha x86 crypto code use two define for the same thing:
> > NUM_SHA1_DIGEST_WORDS and SHA1_DIGEST_LENGTH
> > Replace them by SHA1_DIGEST_SIZE/4
> 
> Thanks.  Acked-by: Tim Chen 
> > 
> > Signed-off-by: LABBE Corentin 
> 

Hello

Thanks for your ack, but Thomas Gleixner in the same time NACK this patch.
Just in case I attach the new patch, for permit you to decide which one you 
prefer.

Regards

--8<--
>From 7439bc57d95de49a510c0eb5d328dec10a3c6689 Mon Sep 17 00:00:00 2001
From: LABBE Corentin 
Date: Wed, 14 Oct 2015 12:48:04 +0200
Subject: [PATCH] crypto: x86: Remove duplicate define of SHA1_DIGEST_SIZE

The sha x86 crypto code use two define for the same thing:
NUM_SHA1_DIGEST_WORDS and SHA1_DIGEST_LENGTH
Replace them by SHA1_DIGEST_SIZE/4

Signed-off-by: LABBE Corentin 
---
 arch/x86/crypto/sha-mb/sha1_mb.c| 2 +-
 arch/x86/crypto/sha-mb/sha_mb_ctx.h | 1 -
 arch/x86/crypto/sha-mb/sha_mb_mgr.h | 6 +++---
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index a841e97..6544ea7 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -104,7 +104,7 @@ static asmlinkage struct job_sha1* 
(*sha1_job_mgr_get_comp_job)(struct sha1_mb_m
 
 inline void sha1_init_digest(uint32_t *digest)
 {
-   static const uint32_t initial_digest[SHA1_DIGEST_LENGTH] = {SHA1_H0,
+   static const uint32_t initial_digest[SHA1_DIGEST_WORDS] = {SHA1_H0,
SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 };
memcpy(digest, initial_digest, sizeof(initial_digest));
 }
diff --git a/arch/x86/crypto/sha-mb/sha_mb_ctx.h 
b/arch/x86/crypto/sha-mb/sha_mb_ctx.h
index e36069d..9fd36eb5 100644
--- a/arch/x86/crypto/sha-mb/sha_mb_ctx.h
+++ b/arch/x86/crypto/sha-mb/sha_mb_ctx.h
@@ -94,7 +94,6 @@ enum hash_ctx_error {
 
 
 /* Hash Constants and Typedefs */
-#define SHA1_DIGEST_LENGTH  5
 #define SHA1_LOG2_BLOCK_SIZE6
 
 #define SHA1_PADLENGTHFIELD_SIZE8
diff --git a/arch/x86/crypto/sha-mb/sha_mb_mgr.h 
b/arch/x86/crypto/sha-mb/sha_mb_mgr.h
index 08ad1a9..b295e15 100644
--- a/arch/x86/crypto/sha-mb/sha_mb_mgr.h
+++ b/arch/x86/crypto/sha-mb/sha_mb_mgr.h
@@ -54,10 +54,10 @@
 #ifndef __SHA_MB_MGR_H
 #define __SHA_MB_MGR_H
 
-
+#include 
 #include 
 
-#define NUM_SHA1_DIGEST_WORDS 5
+#define SHA1_DIGEST_WORDS (SHA1_DIGEST_SIZE / sizeof(u32))
 
 enum job_sts { STS_UNKNOWN = 0,
STS_BEING_PROCESSED = 1,
@@ -69,7 +69,7 @@ enum job_sts {STS_UNKNOWN = 0,
 struct job_sha1 {
u8  *buffer;
u32 len;
-   u32 result_digest[NUM_SHA1_DIGEST_WORDS] __aligned(32);
+   u32 result_digest[SHA1_DIGEST_WORDS] __aligned(32);
enumjob_sts status;
void*user_data;
 };
-- 
2.4.10
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html