Re: [PATCH v2 2/4] crypto: ccm - switch to separate cbcmac driver

2017-02-02 Thread Ard Biesheuvel
On 28 January 2017 at 23:33, Ard Biesheuvel  wrote:
> Update the generic CCM driver to defer CBC-MAC processing to a
> dedicated CBC-MAC ahash transform rather than open coding this
> transform (and much of the associated scatterwalk plumbing) in
> the CCM driver itself.
>
> This cleans up the code considerably, but more importantly, it allows
> the use of alternative CBC-MAC implementations that don't suffer from
> performance degradation due to significant setup time (e.g., the NEON
> based AES code needs to load the entire S-box into SIMD registers, which
> cannot be amortized over the entire input when using the AES cipher
> directly)
>
> Signed-off-by: Ard Biesheuvel 

I need to respin this: the idata[] buffer in crypto_ccm_auth() should
be 16 bytes, not 6.
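
For reference, the per-block operation that moves out of the open-coded
compute_mac() routine and into the new cbcmac transform is just the usual
CBC-MAC chaining step. A minimal sketch (not the actual kernel code; the
helper name is made up, but dg[] matches the digest buffer in the new
cbcmac_desc_ctx):

static void cbcmac_update_block(struct crypto_cipher *tfm, u8 dg[16],
				const u8 *block)
{
	/* XOR the next 16-byte block into the running digest ... */
	crypto_xor(dg, block, 16);

	/* ... and encrypt the result to chain it into the MAC */
	crypto_cipher_encrypt_one(tfm, dg, dg);
}

Partial input is buffered in the desc ctx ('len') until a full 16-byte
block is available, and the final dg[] value is the MAC.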

> ---
>  crypto/Kconfig |   1 +
>  crypto/ccm.c   | 376 +---
>  2 files changed, 240 insertions(+), 137 deletions(-)
>
> diff --git a/crypto/Kconfig b/crypto/Kconfig
> index 160f08e721cc..e8269d1b0282 100644
> --- a/crypto/Kconfig
> +++ b/crypto/Kconfig
> @@ -263,6 +263,7 @@ comment "Authenticated Encryption with Associated Data"
>  config CRYPTO_CCM
> tristate "CCM support"
> select CRYPTO_CTR
> +   select CRYPTO_HASH
> select CRYPTO_AEAD
> help
>   Support for Counter with CBC MAC. Required for IPsec.
> diff --git a/crypto/ccm.c b/crypto/ccm.c
> index 26b924d1e582..62dbaa58eeb0 100644
> --- a/crypto/ccm.c
> +++ b/crypto/ccm.c
> @@ -11,6 +11,7 @@
>   */
>
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -23,11 +24,11 @@
>
>  struct ccm_instance_ctx {
> struct crypto_skcipher_spawn ctr;
> -   struct crypto_spawn cipher;
> +   struct crypto_ahash_spawn mac;
>  };
>
>  struct crypto_ccm_ctx {
> -   struct crypto_cipher *cipher;
> +   struct crypto_ahash *mac;
> struct crypto_skcipher *ctr;
>  };
>
> @@ -44,15 +45,22 @@ struct crypto_rfc4309_req_ctx {
>
>  struct crypto_ccm_req_priv_ctx {
> u8 odata[16];
> -   u8 idata[16];
> u8 auth_tag[16];
> -   u32 ilen;
> u32 flags;
> struct scatterlist src[3];
> struct scatterlist dst[3];
> struct skcipher_request skreq;
>  };
>
> +struct cbcmac_tfm_ctx {
> +   struct crypto_cipher *child;
> +};
> +
> +struct cbcmac_desc_ctx {
> +   unsigned int len;
> +   u8 dg[];
> +};
> +
>  static inline struct crypto_ccm_req_priv_ctx *crypto_ccm_reqctx(
> struct aead_request *req)
>  {
> @@ -84,7 +92,7 @@ static int crypto_ccm_setkey(struct crypto_aead *aead, 
> const u8 *key,
>  {
> struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
> struct crypto_skcipher *ctr = ctx->ctr;
> -   struct crypto_cipher *tfm = ctx->cipher;
> +   struct crypto_ahash *mac = ctx->mac;
> int err = 0;
>
> crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK);
> @@ -96,11 +104,11 @@ static int crypto_ccm_setkey(struct crypto_aead *aead, 
> const u8 *key,
> if (err)
> goto out;
>
> -   crypto_cipher_clear_flags(tfm, CRYPTO_TFM_REQ_MASK);
> -   crypto_cipher_set_flags(tfm, crypto_aead_get_flags(aead) &
> +   crypto_ahash_clear_flags(mac, CRYPTO_TFM_REQ_MASK);
> +   crypto_ahash_set_flags(mac, crypto_aead_get_flags(aead) &
> CRYPTO_TFM_REQ_MASK);
> -   err = crypto_cipher_setkey(tfm, key, keylen);
> -   crypto_aead_set_flags(aead, crypto_cipher_get_flags(tfm) &
> +   err = crypto_ahash_setkey(mac, key, keylen);
> +   crypto_aead_set_flags(aead, crypto_ahash_get_flags(mac) &
>   CRYPTO_TFM_RES_MASK);
>
>  out:
> @@ -167,119 +175,61 @@ static int format_adata(u8 *adata, unsigned int a)
> return len;
>  }
>
> -static void compute_mac(struct crypto_cipher *tfm, u8 *data, int n,
> -  struct crypto_ccm_req_priv_ctx *pctx)
> -{
> -   unsigned int bs = 16;
> -   u8 *odata = pctx->odata;
> -   u8 *idata = pctx->idata;
> -   int datalen, getlen;
> -
> -   datalen = n;
> -
> -   /* first time in here, block may be partially filled. */
> -   getlen = bs - pctx->ilen;
> -   if (datalen >= getlen) {
> -   memcpy(idata + pctx->ilen, data, getlen);
> -   crypto_xor(odata, idata, bs);
> -   crypto_cipher_encrypt_one(tfm, odata, odata);
> -   datalen -= getlen;
> -   data += getlen;
> -   pctx->ilen = 0;
> -   }
> -
> -   /* now encrypt rest of data */
> -   while (datalen >= bs) {
> -   crypto_xor(odata, data, bs);
> -   crypto_cipher_encrypt_one(tfm, odata, odata);
> -
> -   datalen -= bs;
> -   data += bs;
> -   }
> -
> -   /* check and see if there's leftover data that wasn't
> -* enough to fill a block.
> -*/
> -   if (datalen) {
> -   

Re: [PATCH v5 2/3] drivers: crypto: Add the Virtual Function driver for CPT

2017-02-02 Thread Sasha Levin
On Mon, Jan 30, 2017 at 7:30 AM, George Cherian
 wrote:
> diff --git a/drivers/crypto/cavium/cpt/cptvf_main.c 
> b/drivers/crypto/cavium/cpt/cptvf_main.c
> new file mode 100644
> index 000..4cf466d
> --- /dev/null
> +++ b/drivers/crypto/cavium/cpt/cptvf_main.c
> @@ -0,0 +1,948 @@
> +/*
> + * Copyright (C) 2016 Cavium, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of version 2 of the GNU General Public License
> + * as published by the Free Software Foundation.
> + */
> +
> +#include 
> +#include 
> +
> +#include "cptvf.h"
> +
> +#define DRV_NAME   "thunder-cptvf"
> +#define DRV_VERSION"1.0"
> +
> +struct cptvf_wqe {
> +   struct tasklet_struct twork;
> +   void *cptvf;
> +   u32 qno;
> +};
> +
> +struct cptvf_wqe_info {
> +   struct cptvf_wqe vq_wqe[CPT_NUM_QS_PER_VF];
> +};
> +
> +static void vq_work_handler(unsigned long data)
> +{
> +   struct cptvf_wqe_info *cwqe_info = (struct cptvf_wqe_info *)data;
> +   struct cptvf_wqe *cwqe = &cwqe_info->vq_wqe[0];
> +
> +   vq_post_process(cwqe->cptvf, cwqe->qno);
> +}
> +
> +static int init_worker_threads(struct cpt_vf *cptvf)
> +{
> +   struct pci_dev *pdev = cptvf->pdev;
> +   struct cptvf_wqe_info *cwqe_info;
> +   int i;
> +
> +   cwqe_info = kzalloc(sizeof(*cwqe_info), GFP_KERNEL);
> +   if (!cwqe_info)
> +   return -ENOMEM;
> +
> +   if (cptvf->nr_queues) {
> +   dev_info(>dev, "Creating VQ worker threads (%d)\n",
> +cptvf->nr_queues);
> +   }
> +
> +   for (i = 0; i < cptvf->nr_queues; i++) {
> +   tasklet_init(&cwqe_info->vq_wqe[i].twork, vq_work_handler,
> +(u64)cwqe_info);
> +   cwqe_info->vq_wqe[i].qno = i;
> +   cwqe_info->vq_wqe[i].cptvf = cptvf;
> +   }
> +
> +   cptvf->wqe_info = cwqe_info;
> +
> +   return 0;
> +}
> +
> +static void cleanup_worker_threads(struct cpt_vf *cptvf)
> +{
> +   struct cptvf_wqe_info *cwqe_info;
> +   struct pci_dev *pdev = cptvf->pdev;
> +   int i;
> +
> +   cwqe_info = (struct cptvf_wqe_info *)cptvf->wqe_info;
> +   if (!cwqe_info)
> +   return;
> +
> +   if (cptvf->nr_queues) {
> +   dev_info(>dev, "Cleaning VQ worker threads (%u)\n",
> +cptvf->nr_queues);
> +   }
> +
> +   for (i = 0; i < cptvf->nr_queues; i++)
> +   tasklet_kill(&cwqe_info->vq_wqe[i].twork);
> +
> +   kzfree(cwqe_info);
> +   cptvf->wqe_info = NULL;
> +}
> +
> +static void free_pending_queues(struct pending_qinfo *pqinfo)
> +{
> +   int i;
> +   struct pending_queue *queue;
> +
> +   for_each_pending_queue(pqinfo, queue, i) {
> +   if (!queue->head)
> +   continue;
> +
> +   /* free single queue */
> +   kzfree((queue->head));
> +
> +   queue->front = 0;
> +   queue->rear = 0;
> +
> +   return;
> +   }
> +
> +   pqinfo->qlen = 0;
> +   pqinfo->nr_queues = 0;
> +}
> +
> +static int alloc_pending_queues(struct pending_qinfo *pqinfo, u32 qlen,
> +   u32 nr_queues)
> +{
> +   u32 i;
> +   size_t size;
> +   int ret;
> +   struct pending_queue *queue = NULL;
> +
> +   pqinfo->nr_queues = nr_queues;
> +   pqinfo->qlen = qlen;
> +
> +   size = (qlen * sizeof(struct pending_entry));
> +
> +   for_each_pending_queue(pqinfo, queue, i) {
> +   queue->head = kzalloc((size), GFP_KERNEL);
> +   if (!queue->head) {
> +   ret = -ENOMEM;
> +   goto pending_qfail;
> +   }
> +
> +   queue->front = 0;
> +   queue->rear = 0;
> +   atomic64_set((&queue->pending_count), (0));
> +
> +   /* init queue spin lock */
> +   spin_lock_init(&queue->lock);
> +   }
> +
> +   return 0;
> +
> +pending_qfail:
> +   free_pending_queues(pqinfo);
> +
> +   return ret;
> +}
> +
> +static int init_pending_queues(struct cpt_vf *cptvf, u32 qlen, u32 nr_queues)
> +{
> +   struct pci_dev *pdev = cptvf->pdev;
> +   int ret;
> +
> +   if (!nr_queues)
> +   return 0;
> +
> +   ret = alloc_pending_queues(&cptvf->pqinfo, qlen, nr_queues);
> +   if (ret) {
> +   dev_err(&pdev->dev, "failed to setup pending queues (%u)\n",
> +   nr_queues);
> +   return ret;
> +   }
> +
> +   return 0;
> +}
> +
> +static void cleanup_pending_queues(struct cpt_vf *cptvf)
> +{
> +   struct pci_dev *pdev = cptvf->pdev;
> +
> +   if (!cptvf->nr_queues)
> +   return;
> +
> +   dev_info(>dev, "Cleaning VQ pending queue (%u)\n",
> +cptvf->nr_queues);
> +   free_pending_queues(&cptvf->pqinfo);
> +}
> +
> +static void free_command_queues(struct 

Re: [PATCH] crypto: ccp: Fix double add when creating new DMA command

2017-02-02 Thread Gary R Hook

On 01/27/2017 05:09 PM, Gary R Hook wrote:

Eliminate a double-add by creating a new list to manage command
descriptors as they are created; move a descriptor to the pending
list when its command is submitted.


Herbert,

Another patch that could use some 4.10 love. Possible?

Thanks,
Gary




Signed-off-by: Gary R Hook 
---
 drivers/crypto/ccp/ccp-dev.h   |1 +
 drivers/crypto/ccp/ccp-dmaengine.c |6 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/ccp/ccp-dev.h b/drivers/crypto/ccp/ccp-dev.h
index 830f35e..649e561 100644
--- a/drivers/crypto/ccp/ccp-dev.h
+++ b/drivers/crypto/ccp/ccp-dev.h
@@ -238,6 +238,7 @@ struct ccp_dma_chan {
struct ccp_device *ccp;

spinlock_t lock;
+   struct list_head created;
struct list_head pending;
struct list_head active;
struct list_head complete;
diff --git a/drivers/crypto/ccp/ccp-dmaengine.c 
b/drivers/crypto/ccp/ccp-dmaengine.c
index 6553912..e5d9278 100644
--- a/drivers/crypto/ccp/ccp-dmaengine.c
+++ b/drivers/crypto/ccp/ccp-dmaengine.c
@@ -63,6 +63,7 @@ static void ccp_free_chan_resources(struct dma_chan *dma_chan)
ccp_free_desc_resources(chan->ccp, &chan->complete);
ccp_free_desc_resources(chan->ccp, &chan->active);
ccp_free_desc_resources(chan->ccp, &chan->pending);
+   ccp_free_desc_resources(chan->ccp, &chan->created);

spin_unlock_irqrestore(&chan->lock, flags);
 }
@@ -273,6 +274,7 @@ static dma_cookie_t ccp_tx_submit(struct 
dma_async_tx_descriptor *tx_desc)
spin_lock_irqsave(&chan->lock, flags);

cookie = dma_cookie_assign(tx_desc);
+   list_del(&desc->entry);
list_add_tail(&desc->entry, &chan->pending);

spin_unlock_irqrestore(&chan->lock, flags);
@@ -426,7 +428,7 @@ static struct ccp_dma_desc *ccp_create_desc(struct dma_chan 
*dma_chan,

spin_lock_irqsave(&chan->lock, sflags);

-   list_add_tail(&desc->entry, &chan->pending);
+   list_add_tail(&desc->entry, &chan->created);

spin_unlock_irqrestore(&chan->lock, sflags);

@@ -610,6 +612,7 @@ static int ccp_terminate_all(struct dma_chan *dma_chan)
/*TODO: Purge the complete list? */
ccp_free_desc_resources(chan->ccp, &chan->active);
ccp_free_desc_resources(chan->ccp, &chan->pending);
+   ccp_free_desc_resources(chan->ccp, &chan->created);

spin_unlock_irqrestore(&chan->lock, flags);

@@ -679,6 +682,7 @@ int ccp_dmaengine_register(struct ccp_device *ccp)
chan->ccp = ccp;

spin_lock_init(&chan->lock);
+   INIT_LIST_HEAD(&chan->created);
INIT_LIST_HEAD(&chan->pending);
INIT_LIST_HEAD(&chan->active);
INIT_LIST_HEAD(&chan->complete);






Re: [PATCH] crypto: ccp: Fix DMA operations when IOMMU is enabled

2017-02-02 Thread Gary R Hook

On 01/27/2017 03:28 PM, Gary R Hook wrote:

An I/O page fault occurs when the IOMMU is enabled on a
system that supports the v5 CCP.  DMA operations use a
Request ID value that does not match what is expected by
the IOMMU, resulting in the I/O page fault.  Setting the
Request ID value to 0 corrects this issue.


Herbert,

Hoping to get some 4.10 love on this patch, plus one other. Do we see
a possibility at this late date?



Signed-off-by: Gary R Hook 
---
 drivers/crypto/ccp/ccp-dev-v5.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/ccp/ccp-dev-v5.c b/drivers/crypto/ccp/ccp-dev-v5.c
index e2ce819..612898b 100644
--- a/drivers/crypto/ccp/ccp-dev-v5.c
+++ b/drivers/crypto/ccp/ccp-dev-v5.c
@@ -959,7 +959,7 @@ static irqreturn_t ccp5_irq_handler(int irq, void *data)
 static void ccp5_config(struct ccp_device *ccp)
 {
/* Public side */
-   iowrite32(0x1249, ccp->io_regs + CMD5_REQID_CONFIG_OFFSET);
+   iowrite32(0x0, ccp->io_regs + CMD5_REQID_CONFIG_OFFSET);
 }

 static void ccp5other_config(struct ccp_device *ccp)






Re: [PATCH] crypto: arm64/crc32 - merge CRC32 and PMULL instruction based drivers

2017-02-02 Thread Matthias Brugger



On 02/01/2017 04:35 PM, Ard Biesheuvel wrote:

The PMULL based CRC32 implementation already contains code based on the
separate, optional CRC32 instructions to fallback to when operating on
small quantities of data. We can expose these routines directly on systems
that lack the 64x64 PMULL instructions but do implement the CRC32 ones,
which makes the driver that is based solely on those CRC32 instructions
redundant. So remove it.

Note that this aligns arm64 with ARM, whose accelerated CRC32 driver
also combines the CRC32 extension based and the PMULL based versions.

Signed-off-by: Ard Biesheuvel 
---

This is a meaningful patch by itself imho, but also fixes the issue reported
by Matthias where their v4.8 based GCC does not understand the -mcpu=generic+crc
command line option, resulting in failed builds.



This patch fixes the issue, thanks.
Tested-by: Matthias Brugger 


 arch/arm64/configs/defconfig  |   1 -
 arch/arm64/crypto/Kconfig |   9 +-
 arch/arm64/crypto/Makefile|   4 -
 arch/arm64/crypto/crc32-arm64.c   | 290 
 arch/arm64/crypto/crc32-ce-glue.c |  49 +++-
 5 files changed, 41 insertions(+), 312 deletions(-)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 33b744d54739..6fc6f5a2a6e5 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -516,4 +516,3 @@ CONFIG_CRYPTO_GHASH_ARM64_CE=y
 CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
 CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
 # CONFIG_CRYPTO_AES_ARM64_NEON_BLK is not set
-CONFIG_CRYPTO_CRC32_ARM64=y
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index bed7feddfeed..d92293747d63 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -37,8 +37,8 @@ config CRYPTO_CRCT10DIF_ARM64_CE
select CRYPTO_HASH

 config CRYPTO_CRC32_ARM64_CE
-   tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
-   depends on KERNEL_MODE_NEON && CRC32
+   tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
+   depends on CRC32
select CRYPTO_HASH

 config CRYPTO_AES_ARM64
@@ -71,11 +71,6 @@ config CRYPTO_AES_ARM64_NEON_BLK
select CRYPTO_AES
select CRYPTO_SIMD

-config CRYPTO_CRC32_ARM64
-   tristate "CRC32 and CRC32C using optional ARMv8 instructions"
-   depends on ARM64
-   select CRYPTO_HASH
-
 config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha20 symmetric cipher"
depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index d1ae1b9cbe70..b5edc5918c28 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -55,10 +55,6 @@ AFLAGS_aes-neon.o:= -DINTERLEAVE=4

 CFLAGS_aes-glue-ce.o   := -DUSE_V8_CRYPTO_EXTENSIONS

-obj-$(CONFIG_CRYPTO_CRC32_ARM64) += crc32-arm64.o
-
-CFLAGS_crc32-arm64.o   := -mcpu=generic+crc
-
 $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
$(call if_changed_rule,cc_o_c)

diff --git a/arch/arm64/crypto/crc32-arm64.c b/arch/arm64/crypto/crc32-arm64.c
deleted file mode 100644
index 6a37c3c6b11d..
--- a/arch/arm64/crypto/crc32-arm64.c
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * crc32-arm64.c - CRC32 and CRC32C using optional ARMv8 instructions
- *
- * Module based on crypto/crc32c_generic.c
- *
- * CRC32 loop taken from Ed Nevill's Hadoop CRC patch
- * 
http://mail-archives.apache.org/mod_mbox/hadoop-common-dev/201406.mbox/%3C1403687030.3355.19.camel%40localhost.localdomain%3E
- *
- * Using inline assembly instead of intrinsics in order to be backwards
- * compatible with older compilers.
- *
- * Copyright (C) 2014 Linaro Ltd 
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include 
-
-MODULE_AUTHOR("Yazen Ghannam ");
-MODULE_DESCRIPTION("CRC32 and CRC32C using optional ARMv8 instructions");
-MODULE_LICENSE("GPL v2");
-
-#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-
-static u32 crc32_arm64_le_hw(u32 crc, const u8 *p, unsigned int 

[PATCH v3] crypto: aes - add generic time invariant AES cipher

2017-02-02 Thread Ard Biesheuvel
Lookup table based AES is sensitive to timing attacks, which is due to
the fact that such table lookups are data dependent, and the fact that
8 KB worth of tables covers a significant number of cachelines on any
architecture, resulting in an exploitable correlation between the key
and the processing time for known plaintexts.

For network facing algorithms such as CTR, CCM or GCM, this presents a
security risk, which is why arch specific AES ports are typically time
invariant, either through the use of special instructions, or by using
SIMD algorithms that don't rely on table lookups.

For generic code, this is difficult to achieve without losing too much
performance, but we can improve the situation significantly by switching
to an implementation that only needs 256 bytes of table data (the actual
S-box itself), which can be prefetched at the start of each block to
eliminate data dependent latencies.

This code encrypts at ~25 cycles per byte on ARM Cortex-A57 (while the
ordinary generic AES driver manages 18 cycles per byte on this
hardware). Decryption is substantially slower.

Signed-off-by: Ard Biesheuvel 
---
Sending this out as a separate patch since the CCM/CBCMAC series that this
was part of has not actually changed, and does not really depend on this
patch at all.

I have included a decrypt() and a setkey() routine, which allows this driver
to be used as a replacement for the generic AES code. However, the lookup
tables exposed by the generic AES driver are used in a couple of other places
as well.
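
To illustrate the prefetching trick described above (a sketch only, not the
actual aes_ti.c code; the helper name is made up): before the first
data-dependent lookup, every byte of the 256-byte S-box is read once so that
subsequent lookups hit the cache regardless of their index.

static u32 prefetch_sbox(volatile const u8 sbox[256])
{
	u32 acc = 0;
	int i;

	/* Touch the whole table up front; the accumulated value is
	 * returned (and folded into the state by the caller) so the
	 * compiler cannot elide the loads. */
	for (i = 0; i < 256; i++)
		acc |= sbox[i];

	return acc;
}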

 crypto/Kconfig  |  17 +
 crypto/Makefile |   1 +
 crypto/aes_ti.c | 375 
 3 files changed, 393 insertions(+)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index e8269d1b0282..5a51b877277e 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -896,6 +896,23 @@ config CRYPTO_AES
 
  See  for more information.
 
+config CRYPTO_AES_TI
+   tristate "Fixed time AES cipher"
+   select CRYPTO_ALGAPI
+   help
+ This is a generic implementation of AES that attempts to eliminate
+ data dependent latencies as much as possible without affecting
+ performance too much. It is intended for use by the generic CCM
+ and GCM drivers, and other CTR or CMAC/XCBC based modes that rely
+ solely on encryption (decryption is supported as well, but
+ with a more dramatic performance hit)
+
+ Instead of using 16 lookup tables of 1 KB each (8 for encryption and
+ 8 for decryption), this implementation uses just two S-boxes of
+ 256 bytes each, and attempts to eliminate data dependent latencies by
+ prefetching the entire table into the cache at the start of each
+ block.
+
 config CRYPTO_AES_586
tristate "AES cipher algorithms (i586)"
depends on (X86 || UML_X86) && !64BIT
diff --git a/crypto/Makefile b/crypto/Makefile
index b8f0e3eb0791..bcd834536163 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_CRYPTO_TWOFISH) += twofish_generic.o
 obj-$(CONFIG_CRYPTO_TWOFISH_COMMON) += twofish_common.o
 obj-$(CONFIG_CRYPTO_SERPENT) += serpent_generic.o
 obj-$(CONFIG_CRYPTO_AES) += aes_generic.o
+obj-$(CONFIG_CRYPTO_AES_TI) += aes_ti.o
 obj-$(CONFIG_CRYPTO_CAMELLIA) += camellia_generic.o
 obj-$(CONFIG_CRYPTO_CAST_COMMON) += cast_common.o
 obj-$(CONFIG_CRYPTO_CAST5) += cast5_generic.o
diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c
new file mode 100644
index ..92644fd1ac19
--- /dev/null
+++ b/crypto/aes_ti.c
@@ -0,0 +1,375 @@
+/*
+ * Scalar fixed time AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * Emit the sbox as volatile const to prevent the compiler from doing
+ * constant folding on sbox references involving fixed indexes.
+ */
+static volatile const u8 __cacheline_aligned __aesti_sbox[] = {
+   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
+   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
+   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
+   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
+   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
+   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+   0x51, 0xa3, 0x40, 

Re: crypto: NULL deref in sha512_mb_mgr_get_comp_job_avx2

2017-02-02 Thread Tim Chen
On Thu, 2017-02-02 at 11:58 +0100, Dmitry Vyukov wrote:
> On Wed, Feb 1, 2017 at 7:45 PM, Tim Chen  wrote:
> > 
> > On Tue, Jan 31, 2017 at 02:16:31PM +0100, Dmitry Vyukov wrote:
> > > 
> > > Hello,
> > > 
> > > I am getting the following reports with low frequency while running
> > > syzkaller fuzzer. Unfortunately they are not reproducible and happen
> > > in a background thread, so it is difficult to extract any context on
> > > my side. I see only few such crashes per week, so most likely it is
> > > some hard to trigger data race. The following reports are from mmotm
> > > tree, commits 00e20cfc2bf04a0cbe1f5405f61c8426f43eee84 and
> > > fff7e71eac7788904753136f09bcad7471f7799e. Any ideas as to how this can
> > > happen?
> > > 
> > > BUG: unable to handle kernel NULL pointer dereference at 0060
> > > IP: [] sha512_mb_mgr_get_comp_job_avx2+0x6e/0xee
> > > arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S:251
> > > PGD 1d2395067 [  220.874864] PUD 1d2860067
> > > Oops: 0002 [#1] SMP KASAN
> > > Dumping ftrace buffer:
> > >    (ftrace buffer empty)
> > > Modules linked in:
> > > CPU: 0 PID: 516 Comm: kworker/0:1 Not tainted 4.9.0 #4
> > > Hardware name: Google Google Compute Engine/Google Compute Engine,
> > > BIOS Google 01/01/2011
> > > Workqueue: crypto mcryptd_queue_worker
> > > task: 8801d9f346c0 task.stack: 8801d9f08000
> > > RIP: 0010:[]  []
> > > sha512_mb_mgr_get_comp_job_avx2+0x6e/0xee
> > > arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S:251
> > > RSP: 0018:8801d9f0eef8  EFLAGS: 00010202
> > > RAX:  RBX: 8801d7db1190 RCX: 0006
> > > RDX: 0001 RSI: 8801d9f34ee8 RDI: 8801d7db1040
> > > RBP: 8801d9f0f258 R08: 0001 R09: 0001
> > > R10: 0002 R11: 0003 R12: 8801d9f0f230
> > > R13: 8801c8bbc4e0 R14: 8801c8bbc530 R15: 8801d9f0ef70
> > > FS:  () GS:8801dc00() 
> > > knlGS:
> > > CS:  0010 DS:  ES:  CR0: 80050033
> > > CR2: 0060 CR3: 0001cc15a000 CR4: 001406f0
> > > DR0:  DR1:  DR2: 
> > > DR3:  DR6: fffe0ff0 DR7: 0400
> > > Stack:
> > >  8801d7db1040 813fa207 dc00 e8c0f238
> > >  0002 11003b3e1dea e8c0f218 8801d9f0f190
> > >  0282 e8c0f140 e8c0f220 41b58ab3
> > > Call Trace:
> > >  [] sha512_mb_update+0x2f7/0x4e0
> > > arch/x86/crypto/sha512-mb/sha512_mb.c:588
> > >  [] crypto_ahash_update include/crypto/hash.h:512 
> > > [inline]
> > >  [] ahash_mcryptd_update crypto/mcryptd.c:627 [inline]
> > >  [] mcryptd_hash_update+0xcd/0x1c0 crypto/mcryptd.c:373
> > >  [] mcryptd_queue_worker+0xff/0x6a0 crypto/mcryptd.c:181
> > >  [] process_one_work+0xbd0/0x1c10 
> > > kernel/workqueue.c:2096
> > >  [] worker_thread+0x223/0x1990 kernel/workqueue.c:2230
> > >  [] kthread+0x323/0x3e0 kernel/kthread.c:209
> > >  [] ret_from_fork+0x2a/0x40 
> > > arch/x86/entry/entry_64.S:433
> > > Code: 49 0f 42 d3 48 f7 c2 f0 ff ff ff 0f 85 9a 00 00 00 48 83 e2 0f
> > > 48 6b da 08 48 8d 9c 1f 48 01 00 00 48 8b 03 48 c7 03 00 00 00 00 
> > > 40 60 02 00 00 00 48 8b 9f 40 01 00 00 48 c1 e3 08 48 09 d3
> > > RIP  [] sha512_mb_mgr_get_comp_job_avx2+0x6e/0xee
> > > arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S:251
> > >  RSP 
> > > CR2: 0060
> > > ---[ end trace 139fd4cda5dfe2c4 ]---
> > > 
> > Dmitry,
> > 
> > One theory that Megha and I have is that perhaps the flusher
> > and regular computation updates are stepping on each other.
> > Can you try this patch and see if it helps?
> 
> No, for this one I can't. Sorry.
> It happens with very low frequency and only one fuzzer that tests
> mmotm tree. If/when this is committed, I can keep an eye on these
> reports and notify if I still see them.
> If you have a hypothesis as to how it happens, perhaps you could write
> a test that provokes the crash and maybe add some sleeps to kernel
> code or alter timeouts to increase probability.
> 

If this patch is merged, it will most likely go into Herbert's crypto-dev
tree and not Andrew's mm tree for testing.  We will do our best on our
side to replicate the crash scenario in testing.

Will it be possible to have one of your fuzzers run the crypto-dev tree
once the patch is merged there?

Thanks.

Tim

> 
> 
> > 
> > --->8---
> > 
> > From: Tim Chen 
> > Subject: [PATCH] crypto/sha512-mb: Protect sha512 mb ctx mgr access
> > To: Herbert Xu , Dmitry Vyukov 
> > 
> > Cc: Tim Chen , David Miller 
> > , linux-crypto@vger.kernel.org, LKML 
> > , megha@linux.intel.com, 
> > fenghua...@intel.com
> > 
> > The flusher and 

[PATCH] crypto: generic/aes - drop alignment requirement

2017-02-02 Thread Ard Biesheuvel
The generic AES code exposes a 32-bit align mask, which forces all
users of the code to use temporary buffers or take other measures to
ensure the alignment requirement is adhered to, even on architectures
that don't care about alignment for software algorithms such as this
one.

So drop the align mask, and fix the code to use get_unaligned_le32()
where appropriate, which will resolve to whatever is optimal for the
architecture.

Signed-off-by: Ard Biesheuvel 
---
 crypto/aes_generic.c | 64 ++--
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
index 3dd101144a58..ca554d57d01e 100644
--- a/crypto/aes_generic.c
+++ b/crypto/aes_generic.c
@@ -54,6 +54,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static inline u8 byte(const u32 x, const unsigned n)
 {
@@ -1216,7 +1217,6 @@ EXPORT_SYMBOL_GPL(crypto_il_tab);
 int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len)
 {
-   const __le32 *key = (const __le32 *)in_key;
u32 i, t, u, v, w, j;
 
if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
@@ -1225,10 +1225,15 @@ int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, 
const u8 *in_key,
 
ctx->key_length = key_len;
 
-   ctx->key_dec[key_len + 24] = ctx->key_enc[0] = le32_to_cpu(key[0]);
-   ctx->key_dec[key_len + 25] = ctx->key_enc[1] = le32_to_cpu(key[1]);
-   ctx->key_dec[key_len + 26] = ctx->key_enc[2] = le32_to_cpu(key[2]);
-   ctx->key_dec[key_len + 27] = ctx->key_enc[3] = le32_to_cpu(key[3]);
+   ctx->key_enc[0] = get_unaligned_le32(in_key);
+   ctx->key_enc[1] = get_unaligned_le32(in_key + 4);
+   ctx->key_enc[2] = get_unaligned_le32(in_key + 8);
+   ctx->key_enc[3] = get_unaligned_le32(in_key + 12);
+
+   ctx->key_dec[key_len + 24] = ctx->key_enc[0];
+   ctx->key_dec[key_len + 25] = ctx->key_enc[1];
+   ctx->key_dec[key_len + 26] = ctx->key_enc[2];
+   ctx->key_dec[key_len + 27] = ctx->key_enc[3];
 
switch (key_len) {
case AES_KEYSIZE_128:
@@ -1238,17 +1243,17 @@ int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, 
const u8 *in_key,
break;
 
case AES_KEYSIZE_192:
-   ctx->key_enc[4] = le32_to_cpu(key[4]);
-   t = ctx->key_enc[5] = le32_to_cpu(key[5]);
+   ctx->key_enc[4] = get_unaligned_le32(in_key + 16);
+   t = ctx->key_enc[5] = get_unaligned_le32(in_key + 20);
for (i = 0; i < 8; ++i)
loop6(i);
break;
 
case AES_KEYSIZE_256:
-   ctx->key_enc[4] = le32_to_cpu(key[4]);
-   ctx->key_enc[5] = le32_to_cpu(key[5]);
-   ctx->key_enc[6] = le32_to_cpu(key[6]);
-   t = ctx->key_enc[7] = le32_to_cpu(key[7]);
+   ctx->key_enc[4] = get_unaligned_le32(in_key + 16);
+   ctx->key_enc[5] = get_unaligned_le32(in_key + 20);
+   ctx->key_enc[6] = get_unaligned_le32(in_key + 24);
+   t = ctx->key_enc[7] = get_unaligned_le32(in_key + 28);
for (i = 0; i < 6; ++i)
loop8(i);
loop8tophalf(i);
@@ -1329,16 +1334,14 @@ EXPORT_SYMBOL_GPL(crypto_aes_set_key);
 static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-   const __le32 *src = (const __le32 *)in;
-   __le32 *dst = (__le32 *)out;
u32 b0[4], b1[4];
const u32 *kp = ctx->key_enc + 4;
const int key_len = ctx->key_length;
 
-   b0[0] = le32_to_cpu(src[0]) ^ ctx->key_enc[0];
-   b0[1] = le32_to_cpu(src[1]) ^ ctx->key_enc[1];
-   b0[2] = le32_to_cpu(src[2]) ^ ctx->key_enc[2];
-   b0[3] = le32_to_cpu(src[3]) ^ ctx->key_enc[3];
+   b0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
+   b0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4);
+   b0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
+   b0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
 
if (key_len > 24) {
f_nround(b1, b0, kp);
@@ -1361,10 +1364,10 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 
*out, const u8 *in)
f_nround(b1, b0, kp);
f_lround(b0, b1, kp);
 
-   dst[0] = cpu_to_le32(b0[0]);
-   dst[1] = cpu_to_le32(b0[1]);
-   dst[2] = cpu_to_le32(b0[2]);
-   dst[3] = cpu_to_le32(b0[3]);
+   put_unaligned_le32(b0[0], out);
+   put_unaligned_le32(b0[1], out + 4);
+   put_unaligned_le32(b0[2], out + 8);
+   put_unaligned_le32(b0[3], out + 12);
 }
 
 /* decrypt a block of text */
@@ -1401,16 +1404,14 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 
*out, const u8 *in)
 static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-   const __le32 *src = (const 

[PATCH v2] crypto: algapi - make crypto_xor() and crypto_inc() alignment agnostic

2017-02-02 Thread Ard Biesheuvel
Instead of unconditionally forcing 4 byte alignment for all generic
chaining modes that rely on crypto_xor() or crypto_inc() (which may
result in unnecessary copying of data when the underlying hardware
can perform unaligned accesses efficiently), make those functions
deal with unaligned input explicitly, but only if the Kconfig symbol
HAVE_EFFICIENT_UNALIGNED_ACCESS is set. This will allow us to drop
the alignmasks from the CBC, CMAC, CTR, CTS, PCBC and SEQIV drivers.

For crypto_inc(), this simply involves making the 4-byte stride
conditional on HAVE_EFFICIENT_UNALIGNED_ACCESS being set, given that
it typically operates on 16 byte buffers.

For crypto_xor(), an algorithm is implemented that simply runs through
the input using the largest strides possible if unaligned accesses are
allowed. If they are not, an optimal sequence of memory accesses is
emitted that takes the relative alignment of the input buffers into
account, e.g., if the relative misalignment of dst and src is 4 bytes,
the entire xor operation will be completed using 4 byte loads and stores
(modulo unaligned bits at the start and end). Note that all expressions
involving misalign are simply eliminated by the compiler when
HAVE_EFFICIENT_UNALIGNED_ACCESS is defined.

Signed-off-by: Ard Biesheuvel 
---

I have greatly simplified the code, but it should still emit an optimal
sequence of loads and stores depending on the misalignment.
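
As a concrete example of the misalignment handling (illustrative pointer
values only): with dst == 0x1003 and src == 0x1007 on a 64-bit build without
HAVE_EFFICIENT_UNALIGNED_ACCESS, delta = (0x1003 ^ 0x1007) & 7 = 4, so
misalign = 4. One initial byte is xor'ed to advance dst to 0x1004 (and src
to 0x1008); both pointers are then 4-byte aligned, the unsigned long loop is
skipped (misalign != 0), and the rest of the buffer is handled by the
4-byte loop.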

 crypto/algapi.c | 65 +++-
 crypto/cbc.c|  3 -
 crypto/cmac.c   |  3 +-
 crypto/ctr.c|  2 +-
 crypto/cts.c|  3 -
 crypto/pcbc.c   |  3 -
 crypto/seqiv.c  |  2 -
 7 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/crypto/algapi.c b/crypto/algapi.c
index df939b54b09f..e05ed1c5f5d1 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -961,32 +961,63 @@ void crypto_inc(u8 *a, unsigned int size)
__be32 *b = (__be32 *)(a + size);
u32 c;
 
-   for (; size >= 4; size -= 4) {
-   c = be32_to_cpu(*--b) + 1;
-   *b = cpu_to_be32(c);
-   if (c)
-   return;
-   }
+   if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+   !((unsigned long)b & (__alignof__(*b) - 1)))
+   for (; size >= 4; size -= 4) {
+   c = be32_to_cpu(*--b) + 1;
+   *b = cpu_to_be32(c);
+   if (c)
+   return;
+   }
 
crypto_inc_byte(a, size);
 }
 EXPORT_SYMBOL_GPL(crypto_inc);
 
-static inline void crypto_xor_byte(u8 *a, const u8 *b, unsigned int size)
+void crypto_xor(u8 *dst, const u8 *src, unsigned int len)
 {
-   for (; size; size--)
-   *a++ ^= *b++;
-}
+   const int size = sizeof(unsigned long);
+   int delta = ((unsigned long)dst ^ (unsigned long)src) & (size - 1);
+   int misalign = 0;
+
+   if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && delta) {
+   misalign = 1 << __ffs(delta);
+
+   /*
+* If we care about alignment, process as many bytes as
+* needed to advance dst and src to values whose alignments
+* equal their relative misalignment. This will allow us to
+* process the remainder of the input using optimal strides.
+*/
+   while (((unsigned long)dst & (misalign - 1)) && len > 0) {
+   *dst++ ^= *src++;
+   len--;
+   }
+   }
 
-void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
-{
-   u32 *a = (u32 *)dst;
-   u32 *b = (u32 *)src;
+   while (len >= size && !misalign) {
+   *(unsigned long *)dst ^= *(unsigned long *)src;
+   dst += size;
+   src += size;
+   len -= size;
+   }
 
-   for (; size >= 4; size -= 4)
-   *a++ ^= *b++;
+   while (IS_ENABLED(CONFIG_64BIT) && len >= 4 && !(misalign & 3)) {
+   *(u32 *)dst ^= *(u32 *)src;
+   dst += 4;
+   src += 4;
+   len -= 4;
+   }
+
+   while (len >= 2 && !(misalign & 1)) {
+   *(u16 *)dst ^= *(u16 *)src;
+   dst += 2;
+   src += 2;
+   len -= 2;
+   }
 
-   crypto_xor_byte((u8 *)a, (u8 *)b, size);
+   while (len--)
+   *dst++ ^= *src++;
 }
 EXPORT_SYMBOL_GPL(crypto_xor);
 
diff --git a/crypto/cbc.c b/crypto/cbc.c
index 68f751a41a84..bc160a3186dc 100644
--- a/crypto/cbc.c
+++ b/crypto/cbc.c
@@ -145,9 +145,6 @@ static int crypto_cbc_create(struct crypto_template *tmpl, 
struct rtattr **tb)
inst->alg.base.cra_blocksize = alg->cra_blocksize;
inst->alg.base.cra_alignmask = alg->cra_alignmask;
 
-   /* We access the data as u32s when xoring. */
-   inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
-
inst->alg.ivsize = alg->cra_blocksize;

Re: [PATCH v3 2/3] crypto: brcm: Add Broadcom SPU driver

2017-02-02 Thread Rob Rice
Herbert,


> On Feb 2, 2017, at 9:05 AM, Herbert Xu  wrote:
> 
> On Wed, Jan 25, 2017 at 11:44:48AM -0500, Rob Rice wrote:
>> 
>> +static int ahash_export(struct ahash_request *req, void *out)
>> +{
>> +const struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
>> +
>> +memcpy(out, rctx, offsetof(struct iproc_reqctx_s, msg_buf));
>> +return 0;
>> +}
> 
> The reqctx data structure seems to contain a lot of info unrelated
> to the hash state.  Can't we get away with just copying the hash
> state (incr_hash) itself?

Yes, I see your point. I’ll whittle the export state down to just what’s needed 
for the hash.

> 
> Cheers,
> -- 
> Email: Herbert Xu 
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt



Re: [PATCH v3 2/3] crypto: brcm: Add Broadcom SPU driver

2017-02-02 Thread Herbert Xu
On Wed, Jan 25, 2017 at 11:44:48AM -0500, Rob Rice wrote:
>
> +static int ahash_export(struct ahash_request *req, void *out)
> +{
> + const struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
> +
> + memcpy(out, rctx, offsetof(struct iproc_reqctx_s, msg_buf));
> + return 0;
> +}

The reqctx data structure seems to contain a lot of info unrelated
to the hash state.  Can't we get away with just copying the hash
state (incr_hash) itself?

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/6] Add support for ECDSA algorithm

2017-02-02 Thread Herbert Xu
On Thu, Jan 26, 2017 at 11:30:04AM +0530, Nitin Kumbhar wrote:
> 
> This ECDSA implementation is analogous to the RSA kernel implementation for
> signature generation / verification. It extends the ECC family of algorithms
> like ECDH to support signature verification using akcipher. This will be
> used in a way similar to RSA.

Yes but RSA had an in-kernel user in the form of module signature
verification.  We don't add algorithms to the kernel without
actual users.  So this patch-set needs to come with an actual
in-kernel user of ECDSA.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH -stable] crypto: ccm - deal with CTR ciphers that honour iv_out

2017-02-02 Thread Ard Biesheuvel
On 2 February 2017 at 09:53, Herbert Xu  wrote:
> On Thu, Feb 02, 2017 at 08:01:47AM +, Ard Biesheuvel wrote:
>>
>> You are right: due to its construction, the CCM mode does not care
>> about the incremented counter because it clears the counter part of
>> the IV before encrypting the MAC. So this is caused by an optimization
>> in my code rather than the CCM code being incorrect.
>
> OK so you will send me an update for the ARM64 code, right?
>

Yes, on their way

Thanks,
Ard.


[PATCH 2/2] crypto: arm/aes - don't use IV buffer to return final keystream block

2017-02-02 Thread Ard Biesheuvel
The ARM bit sliced AES core code uses the IV buffer to pass the final
keystream block back to the glue code if the input is not a multiple of
the block size, so that the asm code does not have to deal with anything
except 16 byte blocks. This is done under the assumption that the outgoing
IV is meaningless anyway in this case, given that chaining is no longer
possible under these circumstances.

However, as it turns out, the CCM driver does expect the IV to retain
a value that is equal to the original IV except for the counter value,
and even interprets byte zero as a length indicator, which may result
in memory corruption if the IV is overwritten with something else.

So use a separate buffer to return the final keystream block.
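
For context, the CCM counter block layout (per RFC 3610) is roughly:

	/*
	 *  octet 0          : flags; the low 3 bits encode L' = L - 1,
	 *                     the size of the length/counter field
	 *  octets 1 .. 15-L : nonce
	 *  octets 16-L .. 15: block counter i
	 */

so if the asm code scribbles the final keystream block over the IV, the CCM
driver ends up deriving the counter width from keystream bytes.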

Signed-off-by: Ard Biesheuvel 
---
 arch/arm/crypto/aes-neonbs-core.S | 16 +---
 arch/arm/crypto/aes-neonbs-glue.c |  9 +
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/arm/crypto/aes-neonbs-core.S 
b/arch/arm/crypto/aes-neonbs-core.S
index c9477044fbba..2764edc56467 100644
--- a/arch/arm/crypto/aes-neonbs-core.S
+++ b/arch/arm/crypto/aes-neonbs-core.S
@@ -779,14 +779,15 @@ ENDPROC(aesbs_cbc_decrypt)
 
/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
-*   int rounds, int blocks, u8 ctr[], bool final)
+*   int rounds, int blocks, u8 ctr[], u8 final[])
 */
 ENTRY(aesbs_ctr_encrypt)
mov ip, sp
push {r4-r10, lr}
 
ldm ip, {r5-r7} // load args 4-6
-   add r5, r5, r7  // one extra block if final == 1
+   teq r7, #0
+   addne   r5, r5, #1  // one extra block if final != 0
 
vld1.8  {q0}, [r6]  // load counter
vrev32.8 q1, q0
@@ -865,19 +866,20 @@ ENTRY(aesbs_ctr_encrypt)
veor q2, q2, q14
vst1.8  {q2}, [r0]!
teq r4, #0  // skip last block if 'final'
-   W(bne)  4f
+   W(bne)  5f
3: veor q5, q5, q15
vst1.8  {q5}, [r0]!
 
-   next_ctr q0
+4: next_ctr q0
 
subs r5, r5, #8
bgt 99b
 
-   vmov q5, q0
-
-4: vst1.8  {q5}, [r6]
+   vst1.8  {q0}, [r6]
pop {r4-r10, pc}
+
+5: vst1.8  {q5}, [r4]
+   b   4b
 ENDPROC(aesbs_ctr_encrypt)
 
.macro  next_tweak, out, in, const, tmp
diff --git a/arch/arm/crypto/aes-neonbs-glue.c 
b/arch/arm/crypto/aes-neonbs-glue.c
index e262f99a44d3..2920b96dbd36 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -35,7 +35,7 @@ asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 
const rk[],
  int rounds, int blocks, u8 iv[]);
 
 asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 ctr[], bool final);
+ int rounds, int blocks, u8 ctr[], u8 final[]);
 
 asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
  int rounds, int blocks, u8 iv[]);
@@ -186,6 +186,7 @@ static int ctr_encrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
+   u8 buf[AES_BLOCK_SIZE];
int err;
 
err = skcipher_walk_virt(&walk, req, true);
@@ -193,12 +194,12 @@ static int ctr_encrypt(struct skcipher_request *req)
kernel_neon_begin();
while (walk.nbytes > 0) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
-   bool final = (walk.total % AES_BLOCK_SIZE) != 0;
+   u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
 
if (walk.nbytes < walk.total) {
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
-   final = false;
+   final = NULL;
}
 
aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
@@ -210,7 +211,7 @@ static int ctr_encrypt(struct skcipher_request *req)
 
if (dst != src)
memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
-   crypto_xor(dst, walk.iv, walk.total % AES_BLOCK_SIZE);
+   crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);
 
err = skcipher_walk_done(&walk, 0);
break;
-- 
2.7.4



[PATCH 1/2] crypto: arm64/aes - don't use IV buffer to return final keystream block

2017-02-02 Thread Ard Biesheuvel
The arm64 bit sliced AES core code uses the IV buffer to pass the final
keystream block back to the glue code if the input is not a multiple of
the block size, so that the asm code does not have to deal with anything
except 16 byte blocks. This is done under the assumption that the outgoing
IV is meaningless anyway in this case, given that chaining is no longer
possible under these circumstances.

However, as it turns out, the CCM driver does expect the IV to retain
a value that is equal to the original IV except for the counter value,
and even interprets byte zero as a length indicator, which may result
in memory corruption if the IV is overwritten with something else.

So use a separate buffer to return the final keystream block.

Signed-off-by: Ard Biesheuvel 
---
Note that this patch includes the fix

crypto: arm64/aes-neon-bs - honour iv_out requirement in CTR mode

which I sent out earlier.

 arch/arm64/crypto/aes-neonbs-core.S | 37 
 arch/arm64/crypto/aes-neonbs-glue.c |  9 ++---
 2 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/arch/arm64/crypto/aes-neonbs-core.S 
b/arch/arm64/crypto/aes-neonbs-core.S
index 8d0cdaa2768d..ca0472500433 100644
--- a/arch/arm64/crypto/aes-neonbs-core.S
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -853,13 +853,15 @@ ENDPROC(aesbs_xts_decrypt)
 
/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
-*   int rounds, int blocks, u8 iv[], bool final)
+*   int rounds, int blocks, u8 iv[], u8 final[])
 */
 ENTRY(aesbs_ctr_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
 
-   add x4, x4, x6  // do one extra block if final
+   cmp x6, #0
+   cset x10, ne
+   add x4, x4, x10 // do one extra block if final
 
ldp x7, x8, [x5]
ld1 {v0.16b}, [x5]
@@ -874,19 +876,26 @@ CPU_LE(   rev x8, x8  )
csel x4, x4, xzr, pl
csel x9, x9, xzr, le
 
+   tbnz x9, #1, 0f
next_ctr v1
+   tbnz x9, #2, 0f
next_ctr v2
+   tbnz x9, #3, 0f
next_ctr v3
+   tbnz x9, #4, 0f
next_ctr v4
+   tbnz x9, #5, 0f
next_ctr v5
+   tbnz x9, #6, 0f
next_ctr v6
+   tbnz x9, #7, 0f
next_ctr v7
 
 0: mov bskey, x2
mov rounds, x3
bl  aesbs_encrypt8
 
-   lsr x9, x9, x6  // disregard the extra block
+   lsr x9, x9, x10 // disregard the extra block
tbnz x9, #0, 0f
 
ld1 {v8.16b}, [x1], #16
@@ -928,36 +937,36 @@ CPU_LE(   rev x8, x8  )
eor v5.16b, v5.16b, v15.16b
st1 {v5.16b}, [x0], #16
 
-   next_ctr v0
+8: next_ctr v0
cbnz x4, 99b
 
 0: st1 {v0.16b}, [x5]
-8: ldp x29, x30, [sp], #16
+   ldp x29, x30, [sp], #16
ret
 
/*
-* If we are handling the tail of the input (x6 == 1), return the
-* final keystream block back to the caller via the IV buffer.
+* If we are handling the tail of the input (x6 != NULL), return the
+* final keystream block back to the caller.
 */
 1: cbz x6, 8b
-   st1 {v1.16b}, [x5]
+   st1 {v1.16b}, [x6]
b   8b
 2: cbz x6, 8b
-   st1 {v4.16b}, [x5]
+   st1 {v4.16b}, [x6]
b   8b
 3: cbz x6, 8b
-   st1 {v6.16b}, [x5]
+   st1 {v6.16b}, [x6]
b   8b
 4: cbz x6, 8b
-   st1 {v3.16b}, [x5]
+   st1 {v3.16b}, [x6]
b   8b
 5: cbz x6, 8b
-   st1 {v7.16b}, [x5]
+   st1 {v7.16b}, [x6]
b   8b
 6: cbz x6, 8b
-   st1 {v2.16b}, [x5]
+   st1 {v2.16b}, [x6]
b   8b
 7: cbz x6, 8b
-   st1 {v5.16b}, [x5]
+   st1 {v5.16b}, [x6]
b   8b
 ENDPROC(aesbs_ctr_encrypt)
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c 
b/arch/arm64/crypto/aes-neonbs-glue.c
index 863e436ecf89..db2501d93550 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -34,7 +34,7 @@ asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 
const rk[],
  int rounds, int blocks, u8 iv[]);
 
 asmlinkage void 

Re: crypto: NULL deref in sha512_mb_mgr_get_comp_job_avx2

2017-02-02 Thread Dmitry Vyukov
On Wed, Feb 1, 2017 at 7:45 PM, Tim Chen  wrote:
> On Tue, Jan 31, 2017 at 02:16:31PM +0100, Dmitry Vyukov wrote:
>> Hello,
>>
>> I am getting the following reports with low frequency while running
>> syzkaller fuzzer. Unfortunately they are not reproducible and happen
>> in a background thread, so it is difficult to extract any context on
>> my side. I see only few such crashes per week, so most likely it is
>> some hard to trigger data race. The following reports are from mmotm
>> tree, commits 00e20cfc2bf04a0cbe1f5405f61c8426f43eee84 and
>> fff7e71eac7788904753136f09bcad7471f7799e. Any ideas as to how this can
>> happen?
>>
>> BUG: unable to handle kernel NULL pointer dereference at 0060
>> IP: [] sha512_mb_mgr_get_comp_job_avx2+0x6e/0xee
>> arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S:251
>> PGD 1d2395067 [  220.874864] PUD 1d2860067
>> Oops: 0002 [#1] SMP KASAN
>> Dumping ftrace buffer:
>>(ftrace buffer empty)
>> Modules linked in:
>> CPU: 0 PID: 516 Comm: kworker/0:1 Not tainted 4.9.0 #4
>> Hardware name: Google Google Compute Engine/Google Compute Engine,
>> BIOS Google 01/01/2011
>> Workqueue: crypto mcryptd_queue_worker
>> task: 8801d9f346c0 task.stack: 8801d9f08000
>> RIP: 0010:[]  []
>> sha512_mb_mgr_get_comp_job_avx2+0x6e/0xee
>> arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S:251
>> RSP: 0018:8801d9f0eef8  EFLAGS: 00010202
>> RAX:  RBX: 8801d7db1190 RCX: 0006
>> RDX: 0001 RSI: 8801d9f34ee8 RDI: 8801d7db1040
>> RBP: 8801d9f0f258 R08: 0001 R09: 0001
>> R10: 0002 R11: 0003 R12: 8801d9f0f230
>> R13: 8801c8bbc4e0 R14: 8801c8bbc530 R15: 8801d9f0ef70
>> FS:  () GS:8801dc00() knlGS:
>> CS:  0010 DS:  ES:  CR0: 80050033
>> CR2: 0060 CR3: 0001cc15a000 CR4: 001406f0
>> DR0:  DR1:  DR2: 
>> DR3:  DR6: fffe0ff0 DR7: 0400
>> Stack:
>>  8801d7db1040 813fa207 dc00 e8c0f238
>>  0002 11003b3e1dea e8c0f218 8801d9f0f190
>>  0282 e8c0f140 e8c0f220 41b58ab3
>> Call Trace:
>>  [] sha512_mb_update+0x2f7/0x4e0
>> arch/x86/crypto/sha512-mb/sha512_mb.c:588
>>  [] crypto_ahash_update include/crypto/hash.h:512 [inline]
>>  [] ahash_mcryptd_update crypto/mcryptd.c:627 [inline]
>>  [] mcryptd_hash_update+0xcd/0x1c0 crypto/mcryptd.c:373
>>  [] mcryptd_queue_worker+0xff/0x6a0 crypto/mcryptd.c:181
>>  [] process_one_work+0xbd0/0x1c10 kernel/workqueue.c:2096
>>  [] worker_thread+0x223/0x1990 kernel/workqueue.c:2230
>>  [] kthread+0x323/0x3e0 kernel/kthread.c:209
>>  [] ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433
>> Code: 49 0f 42 d3 48 f7 c2 f0 ff ff ff 0f 85 9a 00 00 00 48 83 e2 0f
>> 48 6b da 08 48 8d 9c 1f 48 01 00 00 48 8b 03 48 c7 03 00 00 00 00 
>> 40 60 02 00 00 00 48 8b 9f 40 01 00 00 48 c1 e3 08 48 09 d3
>> RIP  [] sha512_mb_mgr_get_comp_job_avx2+0x6e/0xee
>> arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S:251
>>  RSP 
>> CR2: 0060
>> ---[ end trace 139fd4cda5dfe2c4 ]---
>>
>
> Dmitry,
>
> One theory that Megha and I have is that perhaps the flusher
> and regular computation updates are stepping on each other.
> Can you try this patch and see if it helps?


No, for this one I can't. Sorry.
It happens with very low frequency and only one fuzzer that tests
mmotm tree. If/when this is committed, I can keep an eye on these
reports and notify if I still see them.
If you have a hypothesis as to how it happens, perhaps you could write
a test that provokes the crash and maybe add some sleeps to kernel
code or alter timeouts to increase probability.



> --->8---
>
> From: Tim Chen 
> Subject: [PATCH] crypto/sha512-mb: Protect sha512 mb ctx mgr access
> To: Herbert Xu , Dmitry Vyukov 
> 
> Cc: Tim Chen , David Miller 
> , linux-crypto@vger.kernel.org, LKML 
> , megha@linux.intel.com, 
> fenghua...@intel.com
>
> The flusher and regular multi-buffer computation via mcryptd may race with
> one another.
> Add a lock here and turn off interrupts to access the multi-buffer
> computation state cstate->mgr before a round of computation. This should
> prevent the flusher code jumping in.
>
> Signed-off-by: Tim Chen 
> ---
>  arch/x86/crypto/sha512-mb/sha512_mb.c | 64 
> +++
>  1 file changed, 42 insertions(+), 22 deletions(-)
>
> diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c 
> b/arch/x86/crypto/sha512-mb/sha512_mb.c
> index d210174..f3c1c21 100644
> --- a/arch/x86/crypto/sha512-mb/sha512_mb.c
> +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c
> @@ 
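
Schematically, the serialisation described above amounts to bracketing every
access to the shared manager state with an irq-disabling spinlock. A rough
sketch of the pattern (the work_lock field name is an assumption, not
necessarily what the patch adds):

	unsigned long irqflags;

	/* keep the flusher from touching cstate->mgr while a regular
	 * update is submitting work or collecting a completed job */
	spin_lock_irqsave(&cstate->work_lock, irqflags);
	/* ... operate on cstate->mgr ... */
	spin_unlock_irqrestore(&cstate->work_lock, irqflags);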

Re: [PATCH -stable] crypto: ccm - deal with CTR ciphers that honour iv_out

2017-02-02 Thread Herbert Xu
On Thu, Feb 02, 2017 at 08:01:47AM +, Ard Biesheuvel wrote:
>
> You are right: due to its construction, the CCM mode does not care
> about the incremented counter because it clears the counter part of
> the IV before encrypting the MAC. So this is caused by an optimization
> in my code rather than the CCM code being incorrect.

OK so you will send me an update for the ARM64 code, right?

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH -stable] crypto: ccm - deal with CTR ciphers that honour iv_out

2017-02-02 Thread Ard Biesheuvel
On 2 February 2017 at 05:13, Herbert Xu  wrote:
> On Wed, Feb 01, 2017 at 08:08:09PM +, Ard Biesheuvel wrote:
>>
>> Could you please forward this patch to Linus as well? I noticed that the 
>> patch
>
> Sure, I will do that.
>
>> crypto: arm64/aes-blk - honour iv_out requirement in CBC and CTR modes
>>
>> is now in mainline, which means CCM is now broken on arm64, given that
>> the iv_out requirement for CTR apparently isn't honored by *any*
>> implementation, and CCM wrongly assumes that req->iv retains its value
>> across the call into the CTR skcipher
>
> Hmm, I wonder why we don't see this breakage with the generic
> CTR as it seems to do exactly the same thing.
>

You are right: due to its construction, the CCM mode does not care
about the incremented counter because it clears the counter part of
the IV before encrypting the MAC. So this is caused by an optimization
in my code rather than the CCM code being incorrect.
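
To make that concrete: before encrypting the MAC into the auth tag, the
generic CCM code forces the counter portion of the IV back to zero, roughly
like this (paraphrased from crypto/ccm.c, not a verbatim quote):

	/* RFC 3610 and NIST SP 800-38C require a counter value of zero
	 * when encrypting the authentication tag, so whatever counter
	 * the CTR walk left behind is simply overwritten */
	memset(iv + 15 - iv[0], 0, iv[0] + 1);

Here iv[0] holds L' = L - 1, i.e. one less than the width of the counter
field, so the memset clears exactly the trailing L counter bytes.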