[PATCH v2 net-next 5/6] tls: RX path for ktls

2018-03-22 Thread Dave Watson
Add rx path for tls software implementation.

recvmsg, splice_read, and poll implemented.

An additional sockopt TLS_RX is added, with the same interface as
TLS_TX.  Either TLS_RX or TLS_TX may be provided separately, or
together (with two different setsockopt calls using the appropriate keys).
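
As an illustration only (not part of this patch), a userspace sketch of
configuring both directions, using the existing tls uapi structs and
with key material filled in from the handshake:

  struct tls12_crypto_info_aes_gcm_128 tx_info = {0}, rx_info = {0};

  tx_info.info.version = TLS_1_2_VERSION;
  tx_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
  // ... copy key/iv/salt/rec_seq negotiated for the transmit direction ...

  rx_info.info.version = TLS_1_2_VERSION;
  rx_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
  // ... copy key/iv/salt/rec_seq negotiated for the receive direction ...

  setsockopt(sock, SOL_TLS, TLS_TX, &tx_info, sizeof(tx_info));
  setsockopt(sock, SOL_TLS, TLS_RX, &rx_info, sizeof(rx_info));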

Control messages are passed via CMSG in a similar way to transmit.
If no cmsg buffer is passed, then only application data records
are passed to userspace, and EIO is returned for any other record
type (such as alerts).

EBADMSG is returned for decryption errors, EMSGSIZE is returned for
framing that is too large, and EBADMSG for framing that is too small
(matching OpenSSL semantics).  EINVAL is returned for TLS versions that
do not match the original setsockopt call.  All are unrecoverable.
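
A hedged sketch of how a caller might react to these codes (buffer and
socket names are illustrative only):

  char buf[16384];
  int n = recv(sock, buf, sizeof(buf), 0);

  if (n < 0) {
          switch (errno) {
          case EBADMSG:   /* decryption failure, or framing too small */
          case EMSGSIZE:  /* framing larger than the maximum TLS record */
          case EINVAL:    /* TLS version does not match the setsockopt call */
                  /* unrecoverable: tear the connection down */
                  break;
          default:
                  /* EAGAIN, EINTR, etc. are handled as usual */
                  break;
          }
  }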

strparser is used to parse TLS framing.  Decryption is done directly
into userspace buffers if they are large enough to support it; otherwise
skb_cow_data is called (similar to ipsec) and buffers are decrypted in
place and then copied.  splice_read always decrypts in place, since no
buffers are provided to decrypt into.

sk_poll is overridden, and only returns POLLIN if a full TLS message is
received.  Otherwise we wait for strparser to finish reading a full frame.
Actual decryption is only done during recvmsg or splice_read calls.
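
For illustration only, a userspace sketch of waiting for a decryptable
record before reading:

  char buf[16384];
  struct pollfd pfd = { .fd = sock, .events = POLLIN };

  /* POLLIN is reported only once strparser has a complete TLS record */
  if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
          /* decryption happens inside this recv call */
          ssize_t n = recv(sock, buf, sizeof(buf), 0);
  }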

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h|  27 ++-
 include/uapi/linux/tls.h |   2 +
 net/tls/Kconfig  |   1 +
 net/tls/tls_main.c   |  62 -
 net/tls/tls_sw.c | 587 ++-
 5 files changed, 609 insertions(+), 70 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 095b722..437a746 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include <net/strparser.h>
 
 #include 
 
@@ -58,8 +59,18 @@
 
 struct tls_sw_context {
struct crypto_aead *aead_send;
+   struct crypto_aead *aead_recv;
struct crypto_wait async_wait;
 
+   /* Receive context */
+   struct strparser strp;
+   void (*saved_data_ready)(struct sock *sk);
+   unsigned int (*sk_poll)(struct file *file, struct socket *sock,
+   struct poll_table_struct *wait);
+   struct sk_buff *recv_pkt;
+   u8 control;
+   bool decrypted;
+
/* Sending context */
char aad_space[TLS_AAD_SPACE_SIZE];
 
@@ -96,12 +107,17 @@ struct tls_context {
struct tls_crypto_info crypto_send;
struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128;
};
+   union {
+   struct tls_crypto_info crypto_recv;
+   struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128;
+   };
 
void *priv_ctx;
 
u8 conf:2;
 
struct cipher_context tx;
+   struct cipher_context rx;
 
struct scatterlist *partially_sent_record;
u16 partially_sent_offset;
@@ -128,12 +144,19 @@ int tls_sk_attach(struct sock *sk, int optname, char 
__user *optval,
  unsigned int optlen);
 
 
-int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx);
+int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
 int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
 void tls_sw_close(struct sock *sk, long timeout);
-void tls_sw_free_tx_resources(struct sock *sk);
+void tls_sw_free_resources(struct sock *sk);
+int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+  int nonblock, int flags, int *addr_len);
+unsigned int tls_sw_poll(struct file *file, struct socket *sock,
+struct poll_table_struct *wait);
+ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+  struct pipe_inode_info *pipe,
+  size_t len, unsigned int flags);
 
 void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
 void tls_icsk_clean_acked(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index 293b2cd..c6633e9 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -38,6 +38,7 @@
 
 /* TLS socket options */
 #define TLS_TX 1   /* Set transmit parameters */
+#define TLS_RX 2   /* Set receive parameters */
 
 /* Supported versions */
 #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -59,6 +60,7 @@
 #define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE8
 
 #define TLS_SET_RECORD_TYPE1
+#define TLS_GET_RECORD_TYPE2
 
 struct tls_crypto_info {
__u16 version;
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index eb58303..89b8745a 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -7,6 +7,7 @@ config TLS
select CRYPTO
select CRYPTO_AES
select CRYPTO_GCM
+   select STREAM_PARSER
default n
 

[PATCH v2 net-next 3/6] tls: Pass error code explicitly to tls_err_abort

2018-03-22 Thread Dave Watson
Pass EBADMSG explicitly to tls_err_abort.  Receive path will
pass additional codes - EMSGSIZE if framing is larger than max
TLS record size, EINVAL if TLS version mismatch.
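
As a rough sketch of the intended receive-side callers (the actual call
sites land later in this series; frame_len, version and decrypt_failed
are illustrative names):

	if (frame_len > TLS_MAX_PAYLOAD_SIZE + tls_ctx->rx.overhead_size)
		tls_err_abort(sk, EMSGSIZE);
	else if (version != tls_ctx->crypto_recv.version)
		tls_err_abort(sk, EINVAL);
	else if (decrypt_failed)
		tls_err_abort(sk, EBADMSG);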

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h | 6 +++---
 net/tls/tls_sw.c  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 019e52d..6b44875 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -174,9 +174,9 @@ static inline bool tls_is_pending_open_record(struct 
tls_context *tls_ctx)
return tls_ctx->pending_open_record_frags;
 }
 
-static inline void tls_err_abort(struct sock *sk)
+static inline void tls_err_abort(struct sock *sk, int err)
 {
-   sk->sk_err = EBADMSG;
+   sk->sk_err = err;
sk->sk_error_report(sk);
 }
 
@@ -197,7 +197,7 @@ static inline void tls_advance_record_sn(struct sock *sk,
 struct cipher_context *ctx)
 {
if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size))
-   tls_err_abort(sk);
+   tls_err_abort(sk, EBADMSG);
tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
 ctx->iv_size);
 }
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 338d743..1c79d9a 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -214,7 +214,7 @@ static int tls_push_record(struct sock *sk, int flags,
/* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */
rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags);
if (rc < 0 && rc != -EAGAIN)
-   tls_err_abort(sk);
+   tls_err_abort(sk, EBADMSG);
 
tls_advance_record_sn(sk, &tls_ctx->tx);
return rc;
-- 
2.9.5



[PATCH v2 net-next 1/6] tls: Generalize zerocopy_from_iter

2018-03-22 Thread Dave Watson
Refactor zerocopy_from_iter to take arguments for pages and size,
such that it can be used for both tx and rx. RX will also support
zerocopy direct to output iter, as long as the full message can
be copied at once (a large enough userspace buffer was provided).
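
A sketch of the rx-side call this enables (the call itself is added
later in the series; sgout, to_copy and the locals are illustrative),
filling a caller-provided scatterlist without charging socket memory:

	int pages_used = 0;
	unsigned int chunk = 0;

	err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy,
				 &pages_used, &chunk,
				 sgout, MAX_SKB_FRAGS, false /* charge */);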

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 net/tls/tls_sw.c | 31 +++
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 057a558..ca1d20d 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -226,23 +226,24 @@ static int tls_sw_push_pending_record(struct sock *sk, 
int flags)
 }
 
 static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
- int length)
+ int length, int *pages_used,
+ unsigned int *size_used,
+ struct scatterlist *to, int to_max_pages,
+ bool charge)
 {
-   struct tls_context *tls_ctx = tls_get_ctx(sk);
-   struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
struct page *pages[MAX_SKB_FRAGS];
 
size_t offset;
ssize_t copied, use;
int i = 0;
-   unsigned int size = ctx->sg_plaintext_size;
-   int num_elem = ctx->sg_plaintext_num_elem;
+   unsigned int size = *size_used;
+   int num_elem = *pages_used;
int rc = 0;
int maxpages;
 
while (length > 0) {
i = 0;
-   maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem;
+   maxpages = to_max_pages - num_elem;
if (maxpages == 0) {
rc = -EFAULT;
goto out;
@@ -262,10 +263,11 @@ static int zerocopy_from_iter(struct sock *sk, struct 
iov_iter *from,
while (copied) {
use = min_t(int, copied, PAGE_SIZE - offset);
 
-   sg_set_page(&ctx->sg_plaintext_data[num_elem],
+   sg_set_page(&to[num_elem],
pages[i], use, offset);
-   sg_unmark_end(&ctx->sg_plaintext_data[num_elem]);
-   sk_mem_charge(sk, use);
+   sg_unmark_end(&to[num_elem]);
+   if (charge)
+   sk_mem_charge(sk, use);
 
offset = 0;
copied -= use;
@@ -276,8 +278,9 @@ static int zerocopy_from_iter(struct sock *sk, struct 
iov_iter *from,
}
 
 out:
-   ctx->sg_plaintext_size = size;
-   ctx->sg_plaintext_num_elem = num_elem;
+   *size_used = size;
+   *pages_used = num_elem;
+
return rc;
 }
 
@@ -374,7 +377,11 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t size)
 
if (full_record || eor) {
ret = zerocopy_from_iter(sk, &msg->msg_iter,
-try_to_copy);
+   try_to_copy, &ctx->sg_plaintext_num_elem,
+   &ctx->sg_plaintext_size,
+   ctx->sg_plaintext_data,
+   ARRAY_SIZE(ctx->sg_plaintext_data),
+   true);
if (ret)
goto fallback_to_reg_send;
 
-- 
2.9.5



[PATCH v2 net-next 6/6] tls: Add receive path documentation

2018-03-22 Thread Dave Watson
Add documentation on rx path setup and cmsg interface.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 Documentation/networking/tls.txt | 66 ++--
 1 file changed, 64 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.txt
index 77ed006..58b5ef7 100644
--- a/Documentation/networking/tls.txt
+++ b/Documentation/networking/tls.txt
@@ -48,6 +48,9 @@ the transmit and the receive into the kernel.
 
   setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
 
+Transmit and receive are set separately, but the setup is the same, using either
+TLS_TX or TLS_RX.
+
 Sending TLS application data
 
 
@@ -79,6 +82,28 @@ for memory), or the encryption will always succeed.  If 
send() returns
 -ENOMEM and some data was left on the socket buffer from a previous
 call using MSG_MORE, the MSG_MORE data is left on the socket buffer.
 
+Receiving TLS application data
+------------------------------
+
+After setting the TLS_RX socket option, all recv family socket calls
+are decrypted using TLS parameters provided.  A full TLS record must
+be received before decryption can happen.
+
+  char buffer[16384];
+  recv(sock, buffer, 16384);
+
+Received data is decrypted directly in to the user buffer if it is
+large enough, and no additional allocations occur.  If the userspace
+buffer is too small, data is decrypted in the kernel and copied to
+userspace.
+
+EINVAL is returned if the TLS version in the received message does not
+match the version passed in setsockopt.
+
+EMSGSIZE is returned if the received message is too big.
+
+EBADMSG is returned if decryption failed for any other reason.
+
 Send TLS control messages
 -
 
@@ -118,6 +143,43 @@ using a record of type @record_type.
 Control message data should be provided unencrypted, and will be
 encrypted by the kernel.
 
+Receiving TLS control messages
+------------------------------
+
+TLS control messages are passed in the userspace buffer, with message
+type passed via cmsg.  If no cmsg buffer is provided, an error is
+returned if a control message is received.  Data messages may be
+received without a cmsg buffer set.
+
+  char buffer[16384];
+  char cmsg[CMSG_SPACE(sizeof(unsigned char))];
+  struct msghdr msg = {0};
+  msg.msg_control = cmsg;
+  msg.msg_controllen = sizeof(cmsg);
+
+  struct iovec msg_iov;
+  msg_iov.iov_base = buffer;
+  msg_iov.iov_len = 16384;
+
+  msg.msg_iov = &msg_iov;
+  msg.msg_iovlen = 1;
+
+  int ret = recvmsg(sock, &msg, 0 /* flags */);
+
+  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+  if (cmsg->cmsg_level == SOL_TLS &&
+  cmsg->cmsg_type == TLS_GET_RECORD_TYPE) {
+  int record_type = *((unsigned char *)CMSG_DATA(cmsg));
+  // Do something with record_type, and control message data in
+  // buffer.
+  //
+  // Note that record_type may be == to application data (23).
+  } else {
+  // Buffer contains application data.
+  }
+
+recv will never return data from mixed types of TLS records.
+
 Integrating in to userspace TLS library
 ---
 
@@ -126,10 +188,10 @@ layer of a userspace TLS library.
 
 A patchset to OpenSSL to use ktls as the record layer is here:
 
-https://github.com/Mellanox/tls-openssl
+https://github.com/Mellanox/openssl/commits/tls_rx2
 
 An example of calling send directly after a handshake using
 gnutls.  Since it doesn't implement a full record layer, control
 messages are not supported:
 
-https://github.com/Mellanox/tls-af_ktls_tool
+https://github.com/ktls/af_ktls-tool/commits/RX
-- 
2.9.5



[PATCH v2 net-next 4/6] tls: Refactor variable names

2018-03-22 Thread Dave Watson
Several config variables are prefixed with tx, drop the prefix
since these will be used for both tx and rx.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h  |  2 +-
 net/tls/tls_main.c | 26 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 6b44875..095b722 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -99,7 +99,7 @@ struct tls_context {
 
void *priv_ctx;
 
-   u8 tx_conf:2;
+   u8 conf:2;
 
struct cipher_context tx;
 
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index c671560..c405bee 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -52,7 +52,7 @@ enum {
 };
 
 enum {
-   TLS_BASE_TX,
+   TLS_BASE,
TLS_SW_TX,
TLS_NUM_CONFIG,
 };
@@ -65,7 +65,7 @@ static inline void update_sk_prot(struct sock *sk, struct 
tls_context *ctx)
 {
int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
 
sk->sk_prot = &tls_prots[ip_ver][ctx->conf];
+   sk->sk_prot = _prots[ip_ver][ctx->conf];
 }
 
 int wait_on_pending_writer(struct sock *sk, long *timeo)
@@ -238,7 +238,7 @@ static void tls_sk_proto_close(struct sock *sk, long 
timeout)
lock_sock(sk);
sk_proto_close = ctx->sk_proto_close;
 
-   if (ctx->tx_conf == TLS_BASE_TX) {
+   if (ctx->conf == TLS_BASE) {
kfree(ctx);
goto skip_tx_cleanup;
}
@@ -262,7 +262,7 @@ static void tls_sk_proto_close(struct sock *sk, long 
timeout)
kfree(ctx->tx.rec_seq);
kfree(ctx->tx.iv);
 
-   if (ctx->tx_conf == TLS_SW_TX)
+   if (ctx->conf == TLS_SW_TX)
tls_sw_free_tx_resources(sk);
 
 skip_tx_cleanup:
@@ -371,7 +371,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char 
__user *optval,
struct tls_crypto_info *crypto_info;
struct tls_context *ctx = tls_get_ctx(sk);
int rc = 0;
-   int tx_conf;
+   int conf;
 
if (!optval || (optlen < sizeof(*crypto_info))) {
rc = -EINVAL;
@@ -418,11 +418,11 @@ static int do_tls_setsockopt_tx(struct sock *sk, char 
__user *optval,
 
/* currently SW is default, we will have ethtool in future */
rc = tls_set_sw_offload(sk, ctx);
-   tx_conf = TLS_SW_TX;
+   conf = TLS_SW_TX;
if (rc)
goto err_crypto_info;
 
-   ctx->tx_conf = tx_conf;
+   ctx->conf = conf;
update_sk_prot(sk, ctx);
ctx->sk_write_space = sk->sk_write_space;
sk->sk_write_space = tls_write_space;
@@ -465,12 +465,12 @@ static int tls_setsockopt(struct sock *sk, int level, int 
optname,
 
 static void build_protos(struct proto *prot, struct proto *base)
 {
-   prot[TLS_BASE_TX] = *base;
-   prot[TLS_BASE_TX].setsockopt= tls_setsockopt;
-   prot[TLS_BASE_TX].getsockopt= tls_getsockopt;
-   prot[TLS_BASE_TX].close = tls_sk_proto_close;
+   prot[TLS_BASE] = *base;
+   prot[TLS_BASE].setsockopt   = tls_setsockopt;
+   prot[TLS_BASE].getsockopt   = tls_getsockopt;
+   prot[TLS_BASE].close= tls_sk_proto_close;
 
-   prot[TLS_SW_TX] = prot[TLS_BASE_TX];
+   prot[TLS_SW_TX] = prot[TLS_BASE];
prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg;
prot[TLS_SW_TX].sendpage= tls_sw_sendpage;
 }
@@ -513,7 +513,7 @@ static int tls_init(struct sock *sk)
mutex_unlock(&tcpv6_prot_mutex);
}
 
-   ctx->tx_conf = TLS_BASE_TX;
+   ctx->conf = TLS_BASE;
update_sk_prot(sk, ctx);
 out:
return rc;
-- 
2.9.5



[PATCH v2 net-next 2/6] tls: Move cipher info to a separate struct

2018-03-22 Thread Dave Watson
Separate tx crypto parameters to a separate cipher_context struct.
The same parameters will be used for rx using the same struct.

tls_advance_record_sn is modified to only take the cipher info.
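
With the per-direction state split out, the same helper can later be
pointed at either direction (the rx context shown here is an assumption,
added by a later patch in the series):

	tls_advance_record_sn(sk, &tls_ctx->tx);   /* after encrypting a record */
	tls_advance_record_sn(sk, &tls_ctx->rx);   /* after decrypting a record */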

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h  | 26 +---
 net/tls/tls_main.c |  8 
 net/tls/tls_sw.c   | 58 --
 3 files changed, 49 insertions(+), 43 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 4913430..019e52d 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -81,6 +81,16 @@ enum {
TLS_PENDING_CLOSED_RECORD
 };
 
+struct cipher_context {
+   u16 prepend_size;
+   u16 tag_size;
+   u16 overhead_size;
+   u16 iv_size;
+   char *iv;
+   u16 rec_seq_size;
+   char *rec_seq;
+};
+
 struct tls_context {
union {
struct tls_crypto_info crypto_send;
@@ -91,13 +101,7 @@ struct tls_context {
 
u8 tx_conf:2;
 
-   u16 prepend_size;
-   u16 tag_size;
-   u16 overhead_size;
-   u16 iv_size;
-   char *iv;
-   u16 rec_seq_size;
-   char *rec_seq;
+   struct cipher_context tx;
 
struct scatterlist *partially_sent_record;
u16 partially_sent_offset;
@@ -190,7 +194,7 @@ static inline bool tls_bigint_increment(unsigned char *seq, 
int len)
 }
 
 static inline void tls_advance_record_sn(struct sock *sk,
-struct tls_context *ctx)
+struct cipher_context *ctx)
 {
if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size))
tls_err_abort(sk);
@@ -203,9 +207,9 @@ static inline void tls_fill_prepend(struct tls_context *ctx,
 size_t plaintext_len,
 unsigned char record_type)
 {
-   size_t pkt_len, iv_size = ctx->iv_size;
+   size_t pkt_len, iv_size = ctx->tx.iv_size;
 
-   pkt_len = plaintext_len + iv_size + ctx->tag_size;
+   pkt_len = plaintext_len + iv_size + ctx->tx.tag_size;
 
/* we cover nonce explicit here as well, so buf should be of
 * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE
@@ -217,7 +221,7 @@ static inline void tls_fill_prepend(struct tls_context *ctx,
buf[3] = pkt_len >> 8;
buf[4] = pkt_len & 0xFF;
memcpy(buf + TLS_NONCE_OFFSET,
-  ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size);
+  ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size);
 }
 
 static inline void tls_make_aad(char *buf,
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index d824d54..c671560 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -259,8 +259,8 @@ static void tls_sk_proto_close(struct sock *sk, long 
timeout)
}
}
 
-   kfree(ctx->rec_seq);
-   kfree(ctx->iv);
+   kfree(ctx->tx.rec_seq);
+   kfree(ctx->tx.iv);
 
if (ctx->tx_conf == TLS_SW_TX)
tls_sw_free_tx_resources(sk);
@@ -319,9 +319,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char 
__user *optval,
}
lock_sock(sk);
memcpy(crypto_info_aes_gcm_128->iv,
-  ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+  ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
   TLS_CIPHER_AES_GCM_128_IV_SIZE);
-   memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->rec_seq,
+   memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq,
   TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
release_sock(sk);
if (copy_to_user(optval,
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index ca1d20d..338d743 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -79,7 +79,7 @@ static void trim_both_sgl(struct sock *sk, int target_size)
target_size);
 
if (target_size > 0)
-   target_size += tls_ctx->overhead_size;
+   target_size += tls_ctx->tx.overhead_size;
 
trim_sg(sk, ctx->sg_encrypted_data,
&ctx->sg_encrypted_num_elem,
@@ -152,21 +152,21 @@ static int tls_do_encryption(struct tls_context *tls_ctx,
if (!aead_req)
return -ENOMEM;
 
-   ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size;
-   ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size;
+   ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size;
+   ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size;
 
aead_request_set_tfm(aead_req, ctx->aead_send);
aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out,
-  data_len, tls_ctx->iv);
+  

[PATCH v2 net-next 0/6] TLS Rx

2018-03-22 Thread Dave Watson
TLS tcp socket RX implementation, to match existing TX code.

This patchset completes the software TLS socket, allowing full
bi-directional communication over TLS using normal socket syscalls,
after the handshake has been done in userspace.  Only the symmetric
encryption is done in the kernel.

This allows usage of TLS sockets from within the kernel (for example
with network block device, or from bpf).  Performance can be better
than userspace, with appropriate crypto routines [1].

sk->sk_socket->ops must be overridden to implement splice_read and
poll, but otherwise the interface & implementation match TX closely.
strparser is used to parse TLS framing on receive.
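
Roughly, the override is a copy of inet_stream_ops with the two hooks
replaced; a sketch of the wiring done in tls_main.c in patch 5/6:

  static struct proto_ops tls_sw_proto_ops;

  tls_sw_proto_ops = inet_stream_ops;
  tls_sw_proto_ops.poll = tls_sw_poll;
  tls_sw_proto_ops.splice_read = tls_sw_splice_read;
  ...
  sk->sk_socket->ops = &tls_sw_proto_ops;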

There are OpenSSL RX patches that work with this interface [2], as
well as a testing tool using the socket interface directly (without
cmsg support) [3].  An example tcp socket setup is:

  // Normal tcp socket connect/accept, and TLS handshake
  // using any TLS library.
  setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));

  struct tls12_crypto_info_aes_gcm_128 crypto_info_rx;
  // Fill in crypto_info based on negotiated keys.

  setsockopt(sock, SOL_TLS, TLS_RX, &crypto_info_rx, sizeof(crypto_info_rx));
  // Optionally set up TLS_TX in the same way.

  char buffer[16384];
  int ret = recv(sock, buffer, 16384);

  // cmsg can be received using recvmsg and a msg_control 
  // of type TLS_GET_RECORD_TYPE will be set.
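
For reference, a hedged sketch of that recvmsg call (patch 6/6 carries
the fully documented version):

  char cbuf[CMSG_SPACE(sizeof(unsigned char))];
  struct iovec iov = { .iov_base = buffer, .iov_len = sizeof(buffer) };
  struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                        .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };

  if (recvmsg(sock, &msg, 0) > 0) {
          struct cmsghdr *c = CMSG_FIRSTHDR(&msg);

          if (c && c->cmsg_level == SOL_TLS &&
              c->cmsg_type == TLS_GET_RECORD_TYPE) {
                  unsigned char record_type = *CMSG_DATA(c);
                  /* 21 = alert, 22 = handshake, 23 = application data */
          }
  }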

V1 -> V2

* For too-small framing errors, return EBADMSG, to match openssl error
  code semantics.  Docs and commit logs about this also updated.

RFC -> V1

* Refactor 'tx' variable names to drop tx
* Error return codes changed per discussion
* Only call skb_cow_data based on in-place decryption, 
  drop unnecessary frag list check.

[1] Recent crypto patchset to remove copies, resulting in optimally
zero copies vs. userspace's one, vs. previous kernel's two.  

https://marc.info/?l=linux-crypto-vger&m=151931242406416&w=2

[2] https://github.com/Mellanox/openssl/commits/tls_rx2

[3] https://github.com/ktls/af_ktls-tool/tree/RX

Dave Watson (6):
  tls: Generalize zerocopy_from_iter
  tls: Move cipher info to a separate struct
  tls: Pass error code explicitly to tls_err_abort
  tls: Refactor variable names
  tls: RX path for ktls
  tls: Add receive path documentation

 Documentation/networking/tls.txt |  66 +++-
 include/net/tls.h|  61 ++--
 include/uapi/linux/tls.h |   2 +
 net/tls/Kconfig  |   1 +
 net/tls/tls_main.c   |  92 --
 net/tls/tls_sw.c | 644 ++-
 6 files changed, 740 insertions(+), 126 deletions(-)

-- 
2.9.5



[PATCH net-next 6/6] tls: Add receive path documentation

2018-03-20 Thread Dave Watson
Add documentation on rx path setup and cmsg interface.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 Documentation/networking/tls.txt | 67 ++--
 1 file changed, 65 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.txt
index 77ed006..6c39505 100644
--- a/Documentation/networking/tls.txt
+++ b/Documentation/networking/tls.txt
@@ -48,6 +48,9 @@ the transmit and the receive into the kernel.
 
   setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
 
+Transmit and receive are set separately, but the setup is the same, using either
+TLS_TX or TLS_RX.
+
 Sending TLS application data
 
 
@@ -79,6 +82,29 @@ for memory), or the encryption will always succeed.  If 
send() returns
 -ENOMEM and some data was left on the socket buffer from a previous
 call using MSG_MORE, the MSG_MORE data is left on the socket buffer.
 
+Receiving TLS application data
+------------------------------
+
+After setting the TLS_RX socket option, all recv family socket calls
+are decrypted using TLS parameters provided.  A full TLS record must
+be received before decryption can happen.
+
+  char buffer[16384];
+  recv(sock, buffer, 16384);
+
+Received data is decrypted directly in to the user buffer if it is
+large enough, and no additional allocations occur.  If the userspace
+buffer is too small, data is decrypted in the kernel and copied to
+userspace.
+
+EINVAL is returned if the TLS version in the received message does not
+match the version passed in setsockopt.
+
+EMSGSIZE is returned if the received message is too big, or too small
+when crypto overheads are included.
+
+EBADMSG is returned if decryption failed for any other reason.
+
 Send TLS control messages
 -
 
@@ -118,6 +144,43 @@ using a record of type @record_type.
 Control message data should be provided unencrypted, and will be
 encrypted by the kernel.
 
+Receiving TLS control messages
+------------------------------
+
+TLS control messages are passed in the userspace buffer, with message
+type passed via cmsg.  If no cmsg buffer is provided, an error is
+returned if a control message is received.  Data messages may be
+received without a cmsg buffer set.
+
+  char buffer[16384];
+  char cmsg[CMSG_SPACE(sizeof(unsigned char))];
+  struct msghdr msg = {0};
+  msg.msg_control = cmsg;
+  msg.msg_controllen = sizeof(cmsg);
+
+  struct iovec msg_iov;
+  msg_iov.iov_base = buffer;
+  msg_iov.iov_len = 16384;
+
+  msg.msg_iov = &msg_iov;
+  msg.msg_iovlen = 1;
+
+  int ret = recvmsg(sock, &msg, 0 /* flags */);
+
+  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+  if (cmsg->cmsg_level == SOL_TLS &&
+  cmsg->cmsg_type == TLS_GET_RECORD_TYPE) {
+  int record_type = *((unsigned char *)CMSG_DATA(cmsg));
+  // Do something with record_type, and control message data in
+  // buffer.
+  //
+  // Note that record_type may be == to application data (23).
+  } else {
+  // Buffer contains application data.
+  }
+
+recv will never return data from mixed types of TLS records.
+
 Integrating in to userspace TLS library
 ---
 
@@ -126,10 +189,10 @@ layer of a userspace TLS library.
 
 A patchset to OpenSSL to use ktls as the record layer is here:
 
-https://github.com/Mellanox/tls-openssl
+https://github.com/Mellanox/openssl/commits/tls_rx2
 
 An example of calling send directly after a handshake using
 gnutls.  Since it doesn't implement a full record layer, control
 messages are not supported:
 
-https://github.com/Mellanox/tls-af_ktls_tool
+https://github.com/ktls/af_ktls-tool/commits/RX
-- 
2.9.5



[PATCH net-next 4/6] tls: Refactor variable names

2018-03-20 Thread Dave Watson
Several config variables are prefixed with tx, drop the prefix
since these will be used for both tx and rx.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h  |  2 +-
 net/tls/tls_main.c | 26 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 6b44875..095b722 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -99,7 +99,7 @@ struct tls_context {
 
void *priv_ctx;
 
-   u8 tx_conf:2;
+   u8 conf:2;
 
struct cipher_context tx;
 
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index c671560..c405bee 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -52,7 +52,7 @@ enum {
 };
 
 enum {
-   TLS_BASE_TX,
+   TLS_BASE,
TLS_SW_TX,
TLS_NUM_CONFIG,
 };
@@ -65,7 +65,7 @@ static inline void update_sk_prot(struct sock *sk, struct 
tls_context *ctx)
 {
int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
 
sk->sk_prot = &tls_prots[ip_ver][ctx->conf];
+   sk->sk_prot = _prots[ip_ver][ctx->conf];
 }
 
 int wait_on_pending_writer(struct sock *sk, long *timeo)
@@ -238,7 +238,7 @@ static void tls_sk_proto_close(struct sock *sk, long 
timeout)
lock_sock(sk);
sk_proto_close = ctx->sk_proto_close;
 
-   if (ctx->tx_conf == TLS_BASE_TX) {
+   if (ctx->conf == TLS_BASE) {
kfree(ctx);
goto skip_tx_cleanup;
}
@@ -262,7 +262,7 @@ static void tls_sk_proto_close(struct sock *sk, long 
timeout)
kfree(ctx->tx.rec_seq);
kfree(ctx->tx.iv);
 
-   if (ctx->tx_conf == TLS_SW_TX)
+   if (ctx->conf == TLS_SW_TX)
tls_sw_free_tx_resources(sk);
 
 skip_tx_cleanup:
@@ -371,7 +371,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char 
__user *optval,
struct tls_crypto_info *crypto_info;
struct tls_context *ctx = tls_get_ctx(sk);
int rc = 0;
-   int tx_conf;
+   int conf;
 
if (!optval || (optlen < sizeof(*crypto_info))) {
rc = -EINVAL;
@@ -418,11 +418,11 @@ static int do_tls_setsockopt_tx(struct sock *sk, char 
__user *optval,
 
/* currently SW is default, we will have ethtool in future */
rc = tls_set_sw_offload(sk, ctx);
-   tx_conf = TLS_SW_TX;
+   conf = TLS_SW_TX;
if (rc)
goto err_crypto_info;
 
-   ctx->tx_conf = tx_conf;
+   ctx->conf = conf;
update_sk_prot(sk, ctx);
ctx->sk_write_space = sk->sk_write_space;
sk->sk_write_space = tls_write_space;
@@ -465,12 +465,12 @@ static int tls_setsockopt(struct sock *sk, int level, int 
optname,
 
 static void build_protos(struct proto *prot, struct proto *base)
 {
-   prot[TLS_BASE_TX] = *base;
-   prot[TLS_BASE_TX].setsockopt= tls_setsockopt;
-   prot[TLS_BASE_TX].getsockopt= tls_getsockopt;
-   prot[TLS_BASE_TX].close = tls_sk_proto_close;
+   prot[TLS_BASE] = *base;
+   prot[TLS_BASE].setsockopt   = tls_setsockopt;
+   prot[TLS_BASE].getsockopt   = tls_getsockopt;
+   prot[TLS_BASE].close= tls_sk_proto_close;
 
-   prot[TLS_SW_TX] = prot[TLS_BASE_TX];
+   prot[TLS_SW_TX] = prot[TLS_BASE];
prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg;
prot[TLS_SW_TX].sendpage= tls_sw_sendpage;
 }
@@ -513,7 +513,7 @@ static int tls_init(struct sock *sk)
mutex_unlock(&tcpv6_prot_mutex);
}
 
-   ctx->tx_conf = TLS_BASE_TX;
+   ctx->conf = TLS_BASE;
update_sk_prot(sk, ctx);
 out:
return rc;
-- 
2.9.5



[PATCH net-next 5/6] tls: RX path for ktls

2018-03-20 Thread Dave Watson
Add rx path for tls software implementation.

recvmsg, splice_read, and poll implemented.

An additional sockopt TLS_RX is added, with the same interface as
TLS_TX.  Either TLS_RX or TLS_TX may be provided separately, or
together (with two different setsockopt calls using the appropriate keys).

Control messages are passed via CMSG in a similar way to transmit.
If no cmsg buffer is passed, then only application data records
will be passed to userspace, and EIO is returned for other types of
alerts.

EBADMSG is passed for decryption errors, and EMSGSIZE is passed for
framing errors (either framing too big *or* too small with crypto
overhead). EINVAL is returned for TLS versions that do not match the
original setsockopt call.  All are unrecoverable.

strparser is used to parse TLS framing.  Decryption is done directly
into userspace buffers if they are large enough to support it; otherwise
skb_cow_data is called (similar to ipsec) and buffers are decrypted in
place and then copied.  splice_read always decrypts in place, since no
buffers are provided to decrypt into.

sk_poll is overridden, and only returns POLLIN if a full TLS message is
received.  Otherwise we wait for strparser to finish reading a full frame.
Actual decryption is only done during recvmsg or splice_read calls.
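
For splice_read, an illustrative usage sketch (decryption happens in
place inside the splice call; the size is arbitrary):

  int pfd[2];

  pipe(pfd);
  /* moves decrypted application data from the TLS socket into the pipe */
  ssize_t n = splice(sock, NULL, pfd[1], NULL, 16384, 0);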

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h|  27 ++-
 include/uapi/linux/tls.h |   2 +
 net/tls/Kconfig  |   1 +
 net/tls/tls_main.c   |  62 -
 net/tls/tls_sw.c | 587 ++-
 5 files changed, 609 insertions(+), 70 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 095b722..437a746 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include <net/strparser.h>
 
 #include 
 
@@ -58,8 +59,18 @@
 
 struct tls_sw_context {
struct crypto_aead *aead_send;
+   struct crypto_aead *aead_recv;
struct crypto_wait async_wait;
 
+   /* Receive context */
+   struct strparser strp;
+   void (*saved_data_ready)(struct sock *sk);
+   unsigned int (*sk_poll)(struct file *file, struct socket *sock,
+   struct poll_table_struct *wait);
+   struct sk_buff *recv_pkt;
+   u8 control;
+   bool decrypted;
+
/* Sending context */
char aad_space[TLS_AAD_SPACE_SIZE];
 
@@ -96,12 +107,17 @@ struct tls_context {
struct tls_crypto_info crypto_send;
struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128;
};
+   union {
+   struct tls_crypto_info crypto_recv;
+   struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128;
+   };
 
void *priv_ctx;
 
u8 conf:2;
 
struct cipher_context tx;
+   struct cipher_context rx;
 
struct scatterlist *partially_sent_record;
u16 partially_sent_offset;
@@ -128,12 +144,19 @@ int tls_sk_attach(struct sock *sk, int optname, char 
__user *optval,
  unsigned int optlen);
 
 
-int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx);
+int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
 int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
 void tls_sw_close(struct sock *sk, long timeout);
-void tls_sw_free_tx_resources(struct sock *sk);
+void tls_sw_free_resources(struct sock *sk);
+int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+  int nonblock, int flags, int *addr_len);
+unsigned int tls_sw_poll(struct file *file, struct socket *sock,
+struct poll_table_struct *wait);
+ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+  struct pipe_inode_info *pipe,
+  size_t len, unsigned int flags);
 
 void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
 void tls_icsk_clean_acked(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index 293b2cd..c6633e9 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -38,6 +38,7 @@
 
 /* TLS socket options */
 #define TLS_TX 1   /* Set transmit parameters */
+#define TLS_RX 2   /* Set receive parameters */
 
 /* Supported versions */
 #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -59,6 +60,7 @@
 #define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE8
 
 #define TLS_SET_RECORD_TYPE1
+#define TLS_GET_RECORD_TYPE2
 
 struct tls_crypto_info {
__u16 version;
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index eb58303..89b8745a 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -7,6 +7,7 @@ config TLS
select CRYPTO
select CRYPTO_AES
select CRYPTO_GCM
+   select STREAM_PARSER
default n
 

[PATCH net-next 3/6] tls: Pass error code explicitly to tls_err_abort

2018-03-20 Thread Dave Watson
Pass EBADMSG explicitly to tls_err_abort.  Receive path will
pass additional codes - EMSGSIZE if framing is larger than max
TLS record size, EINVAL if TLS version mismatch.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h | 6 +++---
 net/tls/tls_sw.c  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 019e52d..6b44875 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -174,9 +174,9 @@ static inline bool tls_is_pending_open_record(struct 
tls_context *tls_ctx)
return tls_ctx->pending_open_record_frags;
 }
 
-static inline void tls_err_abort(struct sock *sk)
+static inline void tls_err_abort(struct sock *sk, int err)
 {
-   sk->sk_err = EBADMSG;
+   sk->sk_err = err;
sk->sk_error_report(sk);
 }
 
@@ -197,7 +197,7 @@ static inline void tls_advance_record_sn(struct sock *sk,
 struct cipher_context *ctx)
 {
if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size))
-   tls_err_abort(sk);
+   tls_err_abort(sk, EBADMSG);
tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
 ctx->iv_size);
 }
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index dd4441d..6a0a669 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -269,7 +269,7 @@ static int tls_push_record(struct sock *sk, int flags,
/* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */
rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags);
if (rc < 0 && rc != -EAGAIN)
-   tls_err_abort(sk);
+   tls_err_abort(sk, EBADMSG);
 
tls_advance_record_sn(sk, &tls_ctx->tx);
return rc;
-- 
2.9.5



[PATCH net-next 2/6] tls: Move cipher info to a separate struct

2018-03-20 Thread Dave Watson
Separate tx crypto parameters to a separate cipher_context struct.
The same parameters will be used for rx using the same struct.

tls_advance_record_sn is modified to only take the cipher info.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h  | 26 +---
 net/tls/tls_main.c |  8 
 net/tls/tls_sw.c   | 58 --
 3 files changed, 49 insertions(+), 43 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 4913430..019e52d 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -81,6 +81,16 @@ enum {
TLS_PENDING_CLOSED_RECORD
 };
 
+struct cipher_context {
+   u16 prepend_size;
+   u16 tag_size;
+   u16 overhead_size;
+   u16 iv_size;
+   char *iv;
+   u16 rec_seq_size;
+   char *rec_seq;
+};
+
 struct tls_context {
union {
struct tls_crypto_info crypto_send;
@@ -91,13 +101,7 @@ struct tls_context {
 
u8 tx_conf:2;
 
-   u16 prepend_size;
-   u16 tag_size;
-   u16 overhead_size;
-   u16 iv_size;
-   char *iv;
-   u16 rec_seq_size;
-   char *rec_seq;
+   struct cipher_context tx;
 
struct scatterlist *partially_sent_record;
u16 partially_sent_offset;
@@ -190,7 +194,7 @@ static inline bool tls_bigint_increment(unsigned char *seq, 
int len)
 }
 
 static inline void tls_advance_record_sn(struct sock *sk,
-struct tls_context *ctx)
+struct cipher_context *ctx)
 {
if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size))
tls_err_abort(sk);
@@ -203,9 +207,9 @@ static inline void tls_fill_prepend(struct tls_context *ctx,
 size_t plaintext_len,
 unsigned char record_type)
 {
-   size_t pkt_len, iv_size = ctx->iv_size;
+   size_t pkt_len, iv_size = ctx->tx.iv_size;
 
-   pkt_len = plaintext_len + iv_size + ctx->tag_size;
+   pkt_len = plaintext_len + iv_size + ctx->tx.tag_size;
 
/* we cover nonce explicit here as well, so buf should be of
 * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE
@@ -217,7 +221,7 @@ static inline void tls_fill_prepend(struct tls_context *ctx,
buf[3] = pkt_len >> 8;
buf[4] = pkt_len & 0xFF;
memcpy(buf + TLS_NONCE_OFFSET,
-  ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size);
+  ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size);
 }
 
 static inline void tls_make_aad(char *buf,
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index d824d54..c671560 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -259,8 +259,8 @@ static void tls_sk_proto_close(struct sock *sk, long 
timeout)
}
}
 
-   kfree(ctx->rec_seq);
-   kfree(ctx->iv);
+   kfree(ctx->tx.rec_seq);
+   kfree(ctx->tx.iv);
 
if (ctx->tx_conf == TLS_SW_TX)
tls_sw_free_tx_resources(sk);
@@ -319,9 +319,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char 
__user *optval,
}
lock_sock(sk);
memcpy(crypto_info_aes_gcm_128->iv,
-  ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+  ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
   TLS_CIPHER_AES_GCM_128_IV_SIZE);
-   memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->rec_seq,
+   memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq,
   TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
release_sock(sk);
if (copy_to_user(optval,
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index d58f675..dd4441d 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -79,7 +79,7 @@ static void trim_both_sgl(struct sock *sk, int target_size)
target_size);
 
if (target_size > 0)
-   target_size += tls_ctx->overhead_size;
+   target_size += tls_ctx->tx.overhead_size;
 
trim_sg(sk, ctx->sg_encrypted_data,
&ctx->sg_encrypted_num_elem,
@@ -207,21 +207,21 @@ static int tls_do_encryption(struct tls_context *tls_ctx,
if (!aead_req)
return -ENOMEM;
 
-   ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size;
-   ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size;
+   ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size;
+   ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size;
 
aead_request_set_tfm(aead_req, ctx->aead_send);
aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out,
-  data_len, tls_ctx->iv);
+  

[PATCH net-next 1/6] tls: Generalize zerocopy_from_iter

2018-03-20 Thread Dave Watson
Refactor zerocopy_from_iter to take arguments for pages and size,
such that it can be used for both tx and rx. RX will also support
zerocopy direct to output iter, as long as the full message can
be copied at once (a large enough userspace buffer was provided).

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 net/tls/tls_sw.c | 31 +++
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f26376e..d58f675 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -281,23 +281,24 @@ static int tls_sw_push_pending_record(struct sock *sk, 
int flags)
 }
 
 static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
- int length)
+ int length, int *pages_used,
+ unsigned int *size_used,
+ struct scatterlist *to, int to_max_pages,
+ bool charge)
 {
-   struct tls_context *tls_ctx = tls_get_ctx(sk);
-   struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
struct page *pages[MAX_SKB_FRAGS];
 
size_t offset;
ssize_t copied, use;
int i = 0;
-   unsigned int size = ctx->sg_plaintext_size;
-   int num_elem = ctx->sg_plaintext_num_elem;
+   unsigned int size = *size_used;
+   int num_elem = *pages_used;
int rc = 0;
int maxpages;
 
while (length > 0) {
i = 0;
-   maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem;
+   maxpages = to_max_pages - num_elem;
if (maxpages == 0) {
rc = -EFAULT;
goto out;
@@ -317,10 +318,11 @@ static int zerocopy_from_iter(struct sock *sk, struct 
iov_iter *from,
while (copied) {
use = min_t(int, copied, PAGE_SIZE - offset);
 
-   sg_set_page(&ctx->sg_plaintext_data[num_elem],
+   sg_set_page(&to[num_elem],
pages[i], use, offset);
-   sg_unmark_end(&ctx->sg_plaintext_data[num_elem]);
-   sk_mem_charge(sk, use);
+   sg_unmark_end(&to[num_elem]);
+   if (charge)
+   sk_mem_charge(sk, use);
 
offset = 0;
copied -= use;
@@ -331,8 +333,9 @@ static int zerocopy_from_iter(struct sock *sk, struct 
iov_iter *from,
}
 
 out:
-   ctx->sg_plaintext_size = size;
-   ctx->sg_plaintext_num_elem = num_elem;
+   *size_used = size;
+   *pages_used = num_elem;
+
return rc;
 }
 
@@ -429,7 +432,11 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t size)
 
if (full_record || eor) {
ret = zerocopy_from_iter(sk, &msg->msg_iter,
-try_to_copy);
+   try_to_copy, &ctx->sg_plaintext_num_elem,
+   &ctx->sg_plaintext_size,
+   ctx->sg_plaintext_data,
+   ARRAY_SIZE(ctx->sg_plaintext_data),
+   true);
if (ret)
goto fallback_to_reg_send;
 
-- 
2.9.5



[PATCH net-next 0/6] TLS Rx

2018-03-20 Thread Dave Watson
TLS tcp socket RX implementation, to match existing TX code.

This patchset completes the software TLS socket, allowing full
bi-directional communication over TLS using normal socket syscalls,
after the handshake has been done in userspace.  Only the symmetric
encryption is done in the kernel.

This allows usage of TLS sockets from within the kernel (for example
with network block device, or from bpf).  Performance can be better
than userspace, with appropriate crypto routines [1].

sk->sk_socket->ops must be overridden to implement splice_read and
poll, but otherwise the interface & implementation match TX closely.
strparser is used to parse TLS framing on receive.

There are OpenSSL RX patches that work with this interface [2], as
well as a testing tool using the socket interface directly (without
cmsg support) [3].  An example tcp socket setup is:

  // Normal tcp socket connect/accept, and TLS handshake
  // using any TLS library.
  setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));

  struct tls12_crypto_info_aes_gcm_128 crypto_info_rx;
  // Fill in crypto_info based on negotiated keys.

  setsockopt(sock, SOL_TLS, TLS_RX, &crypto_info_rx, sizeof(crypto_info_rx));
  // Optionally set up TLS_TX in the same way.

  char buffer[16384];
  int ret = recv(sock, buffer, 16384);

  // cmsg can be received using recvmsg and a msg_control 
  // of type TLS_GET_RECORD_TYPE will be set.

RFC -> V1

* Refactor 'tx' variable names to drop tx
* Error return codes changed per discussion
* Only call skb_cow_data based on in-place decryption, 
  drop unnecessary frag list check.

[1] Recent crypto patchset to remove copies, resulting in optimally
zero copies vs. userspace's one, vs. previous kernel's two.  

https://marc.info/?l=linux-crypto-vger&m=151931242406416&w=2

[2] https://github.com/Mellanox/openssl/commits/tls_rx2

[3] https://github.com/ktls/af_ktls-tool/tree/RX

Dave Watson (6):
  tls: Generalize zerocopy_from_iter
  tls: Move cipher info to a separate struct
  tls: Pass error code explicitly to tls_err_abort
  tls: Refactor variable names
  tls: RX path for ktls
  tls: Add receive path documentation

 Documentation/networking/tls.txt |  67 +++-
 include/net/tls.h|  61 ++--
 include/uapi/linux/tls.h |   2 +
 net/tls/Kconfig  |   1 +
 net/tls/tls_main.c   |  92 --
 net/tls/tls_sw.c | 644 ++-
 6 files changed, 741 insertions(+), 126 deletions(-)

-- 
2.9.5



Re: [PATCH RFC 4/5] tls: RX path for ktls

2018-03-08 Thread Dave Watson
On 03/08/18 09:48 PM, Boris Pismenny wrote:
> Hi Dave,
> 
> On 03/08/18 18:50, Dave Watson wrote:
> > Add rx path for tls software implementation.
> > 
> > recvmsg, splice_read, and poll implemented.
> > 
> > An additional sockopt TLS_RX is added, with the same interface as
> > TLS_TX.  Either TLX_RX or TLX_TX may be provided separately, or
> > together (with two different setsockopt calls with appropriate keys).
> > 
> > Control messages are passed via CMSG in a similar way to transmit.
> > If no cmsg buffer is passed, then only application data records
> > will be passed to userspace, and EIO is returned for other types of
> > alerts.
> > 
> > EBADMSG is passed for decryption errors, and E2BIG is passed for framing
> > errors.  Both are unrecoverable.
> 
> I think E2BIG is for too long argument list. EMSGSIZE might be more
> appropriate.

Sounds good.

> Also, we must check that the record is not too short (cipher specific).
> For TLS1.2 with AES-GCM the minimum length is 8 (IV) + 16 (TAG).
> The correct error for this case is EBADMSG, like a decryption failure.
> 
> Also, how about bad TLS version (e.g. not TLS1.2)?
> A separate error type is required for bad version, because it triggers a
> unique alert in libraries such as OpenSSL.
> I thought of using EINVAL for bad version. What do you think?

Ah, I did not realize there was a separate alert for that, sounds good.

> 
> I wonder if we should provide a more flexible method of obtaining errors for
> the future.
> Maybe use a special CMSG for errors?
> This CMSG will be triggered only after the socket enters the error state.

I'm not opposed to this in principle, but without a concrete use am
hesitant to add it.  I don't know of any other error codes that could
be returned besides the ones discussed above.

> > 
> > +
> > +int tls_sw_recvmsg(struct sock *sk,
> > +  struct msghdr *msg,
> > +  size_t len,
> > +  int nonblock,
> > +  int flags,
> > +  int *addr_len)
> > +{
> > +   struct tls_context *tls_ctx = tls_get_ctx(sk);
> > +   struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
> > +   unsigned char control;
> > +   struct strp_msg *rxm;
> > +   struct sk_buff *skb;
> > +   ssize_t copied = 0;
> > +   bool cmsg = false;
> > +   int err = 0;
> > +   long timeo;
> Maybe try to read from the error queue here?

Sure.



[PATCH RFC 5/5] tls: Add receive path documentation

2018-03-08 Thread Dave Watson
Add documentation on rx path setup and cmsg interface.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 Documentation/networking/tls.txt | 59 ++--
 1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.txt
index 77ed006..d341016 100644
--- a/Documentation/networking/tls.txt
+++ b/Documentation/networking/tls.txt
@@ -48,6 +48,9 @@ the transmit and the receive into the kernel.
 
   setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
 
+Transmit and receive are set separately, but the setup is the same, using either
+TLS_TX or TLS_RX.
+
 Sending TLS application data
 
 
@@ -79,6 +82,21 @@ for memory), or the encryption will always succeed.  If 
send() returns
 -ENOMEM and some data was left on the socket buffer from a previous
 call using MSG_MORE, the MSG_MORE data is left on the socket buffer.
 
+Receiving TLS application data
+------------------------------
+
+After setting the TLS_RX socket option, all recv family socket calls
+are decrypted using TLS parameters provided.  A full TLS record must
+be received before decryption can happen.
+
+  char buffer[16384];
+  recv(sock, buffer, 16384);
+
+Received data is decrypted directly in to the user buffer if it is
+large enough, and no additional allocations occur.  If the userspace
+buffer is too small, data is decrypted in the kernel and copied to
+userspace.
+
 Send TLS control messages
 -
 
@@ -118,6 +136,43 @@ using a record of type @record_type.
 Control message data should be provided unencrypted, and will be
 encrypted by the kernel.
 
+Receiving TLS control messages
+------------------------------
+
+TLS control messages are passed in the userspace buffer, with message
+type passed via cmsg.  If no cmsg buffer is provided, an error is
+returned if a control message is received.  Data messages may be
+received without a cmsg buffer set.
+
+  char buffer[16384];
+  char cmsg[CMSG_SPACE(sizeof(unsigned char))];
+  struct msghdr msg = {0};
+  msg.msg_control = cmsg;
+  msg.msg_controllen = sizeof(cmsg);
+
+  struct iovec msg_iov;
+  msg_iov.iov_base = buffer;
+  msg_iov.iov_len = 16384;
+
+  msg.msg_iov = &msg_iov;
+  msg.msg_iovlen = 1;
+
+  int ret = recvmsg(sock, &msg, 0 /* flags */);
+
+  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+  if (cmsg->cmsg_level == SOL_TLS &&
+  cmsg->cmsg_type == TLS_GET_RECORD_TYPE) {
+  int record_type = *((unsigned char *)CMSG_DATA(cmsg));
+  // Do something with record_type, and control message data in
+  // buffer.
+  //
+  // Note that record_type may be == to application data (23).
+  } else {
+// Buffer contains application data.
+  }
+
+recv will never return data from mixed types of TLS records.
+
 Integrating in to userspace TLS library
 ---
 
@@ -126,10 +181,10 @@ layer of a userspace TLS library.
 
 A patchset to OpenSSL to use ktls as the record layer is here:
 
-https://github.com/Mellanox/tls-openssl
+https://github.com/Mellanox/openssl/commits/tls_rx
 
 An example of calling send directly after a handshake using
 gnutls.  Since it doesn't implement a full record layer, control
 messages are not supported:
 
-https://github.com/Mellanox/tls-af_ktls_tool
+https://github.com/ktls/af_ktls-tool/commits/RX
-- 
2.9.5



[PATCH RFC 1/5] tls: Generalize zerocopy_from_iter

2018-03-08 Thread Dave Watson
Refactor zerocopy_from_iter to take arguments for pages and size,
such that it can be used for both tx and rx. RX will also support
zerocopy direct to output iter, as long as the full message can
be copied at once (a large enough userspace buffer was provided).

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 net/tls/tls_sw.c | 31 +++
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f26376e..d58f675 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -281,23 +281,24 @@ static int tls_sw_push_pending_record(struct sock *sk, 
int flags)
 }
 
 static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
- int length)
+ int length, int *pages_used,
+ unsigned int *size_used,
+ struct scatterlist *to, int to_max_pages,
+ bool charge)
 {
-   struct tls_context *tls_ctx = tls_get_ctx(sk);
-   struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
struct page *pages[MAX_SKB_FRAGS];
 
size_t offset;
ssize_t copied, use;
int i = 0;
-   unsigned int size = ctx->sg_plaintext_size;
-   int num_elem = ctx->sg_plaintext_num_elem;
+   unsigned int size = *size_used;
+   int num_elem = *pages_used;
int rc = 0;
int maxpages;
 
while (length > 0) {
i = 0;
-   maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem;
+   maxpages = to_max_pages - num_elem;
if (maxpages == 0) {
rc = -EFAULT;
goto out;
@@ -317,10 +318,11 @@ static int zerocopy_from_iter(struct sock *sk, struct 
iov_iter *from,
while (copied) {
use = min_t(int, copied, PAGE_SIZE - offset);
 
-   sg_set_page(&ctx->sg_plaintext_data[num_elem],
+   sg_set_page(&to[num_elem],
pages[i], use, offset);
-   sg_unmark_end(&ctx->sg_plaintext_data[num_elem]);
-   sk_mem_charge(sk, use);
+   sg_unmark_end(&to[num_elem]);
+   if (charge)
+   sk_mem_charge(sk, use);
 
offset = 0;
copied -= use;
@@ -331,8 +333,9 @@ static int zerocopy_from_iter(struct sock *sk, struct 
iov_iter *from,
}
 
 out:
-   ctx->sg_plaintext_size = size;
-   ctx->sg_plaintext_num_elem = num_elem;
+   *size_used = size;
+   *pages_used = num_elem;
+
return rc;
 }
 
@@ -429,7 +432,11 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t size)
 
if (full_record || eor) {
ret = zerocopy_from_iter(sk, &msg->msg_iter,
-try_to_copy);
+   try_to_copy, &ctx->sg_plaintext_num_elem,
+   &ctx->sg_plaintext_size,
+   ctx->sg_plaintext_data,
+   ARRAY_SIZE(ctx->sg_plaintext_data),
+   true);
if (ret)
goto fallback_to_reg_send;
 
-- 
2.9.5



[PATCH RFC 4/5] tls: RX path for ktls

2018-03-08 Thread Dave Watson
Add rx path for tls software implementation.

recvmsg, splice_read, and poll implemented.

An additional sockopt TLS_RX is added, with the same interface as
TLS_TX.  Either TLS_RX or TLS_TX may be provided separately, or
together (with two different setsockopt calls using the appropriate keys).

Control messages are passed via CMSG in a similar way to transmit.
If no cmsg buffer is passed, then only application data records
will be passed to userspace, and EIO is returned for other types of
alerts.

EBADMSG is passed for decryption errors, and E2BIG is passed for framing
errors.  Both are unrecoverable.

strparser is used to parse TLS framing.  Decryption is done directly
into userspace buffers if they are large enough to support it; otherwise
skb_cow_data is called (similar to ipsec) and buffers are decrypted in
place and then copied.  splice_read always decrypts in place, since no
buffers are provided to decrypt into.

sk_poll is overridden, and only returns POLLIN if a full TLS message is
received.  Otherwise we wait for strparser to finish reading a full frame.
Actual decryption is only done during recvmsg or splice_read calls.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h|  27 ++-
 include/uapi/linux/tls.h |   2 +
 net/tls/Kconfig  |   1 +
 net/tls/tls_main.c   |  62 -
 net/tls/tls_sw.c | 574 ++-
 5 files changed, 596 insertions(+), 70 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 6b44875..7202026 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include <net/strparser.h>
 
 #include 
 
@@ -58,8 +59,18 @@
 
 struct tls_sw_context {
struct crypto_aead *aead_send;
+   struct crypto_aead *aead_recv;
struct crypto_wait async_wait;
 
+   /* Receive context */
+   struct strparser strp;
+   void (*saved_data_ready)(struct sock *sk);
+   unsigned int (*sk_poll)(struct file *file, struct socket *sock,
+   struct poll_table_struct *wait);
+   struct sk_buff *recv_pkt;
+   u8 control;
+   bool decrypted;
+
/* Sending context */
char aad_space[TLS_AAD_SPACE_SIZE];
 
@@ -96,12 +107,17 @@ struct tls_context {
struct tls_crypto_info crypto_send;
struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128;
};
+   union {
+   struct tls_crypto_info crypto_recv;
+   struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128;
+   };
 
void *priv_ctx;
 
u8 tx_conf:2;
 
struct cipher_context tx;
+   struct cipher_context rx;
 
struct scatterlist *partially_sent_record;
u16 partially_sent_offset;
@@ -128,12 +144,19 @@ int tls_sk_attach(struct sock *sk, int optname, char 
__user *optval,
  unsigned int optlen);
 
 
-int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx);
+int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
 int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
 void tls_sw_close(struct sock *sk, long timeout);
-void tls_sw_free_tx_resources(struct sock *sk);
+void tls_sw_free_resources(struct sock *sk);
+int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+  int nonblock, int flags, int *addr_len);
+unsigned int tls_sw_poll(struct file *file, struct socket *sock,
+struct poll_table_struct *wait);
+ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+  struct pipe_inode_info *pipe,
+  size_t len, unsigned int flags);
 
 void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
 void tls_icsk_clean_acked(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index 293b2cd..c6633e9 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -38,6 +38,7 @@
 
 /* TLS socket options */
 #define TLS_TX 1   /* Set transmit parameters */
+#define TLS_RX 2   /* Set receive parameters */
 
 /* Supported versions */
 #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -59,6 +60,7 @@
 #define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE8
 
 #define TLS_SET_RECORD_TYPE1
+#define TLS_GET_RECORD_TYPE2
 
 struct tls_crypto_info {
__u16 version;
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index eb58303..89b8745a 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -7,6 +7,7 @@ config TLS
select CRYPTO
select CRYPTO_AES
select CRYPTO_GCM
+   select STREAM_PARSER
default n
---help---
Enable kernel support for TLS protocol. This allows symmetric
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index c67156

[PATCH RFC 3/5] tls: Pass error code explicitly to tls_err_abort

2018-03-08 Thread Dave Watson
Pass EBADMSG explicitly to tls_err_abort.  Receive path will
pass additional codes - E2BIG if framing is larger than max
TLS record size.
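
Once the receive path (added later in this series) is in place, these
codes surface to userspace as recv() errors; a rough sketch of the
caller's view:

  int n = recv(sock, buf, sizeof(buf), 0);

  if (n < 0 && errno == EBADMSG) {
          /* record failed decryption/authentication - unrecoverable */
  } else if (n < 0 && errno == E2BIG) {
          /* framing larger than the max TLS record size - unrecoverable */
  }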

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h | 6 +++---
 net/tls/tls_sw.c  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 019e52d..6b44875 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -174,9 +174,9 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx)
return tls_ctx->pending_open_record_frags;
 }
 
-static inline void tls_err_abort(struct sock *sk)
+static inline void tls_err_abort(struct sock *sk, int err)
 {
-   sk->sk_err = EBADMSG;
+   sk->sk_err = err;
sk->sk_error_report(sk);
 }
 
@@ -197,7 +197,7 @@ static inline void tls_advance_record_sn(struct sock *sk,
 struct cipher_context *ctx)
 {
if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size))
-   tls_err_abort(sk);
+   tls_err_abort(sk, EBADMSG);
tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
 ctx->iv_size);
 }
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index dd4441d..6a0a669 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -269,7 +269,7 @@ static int tls_push_record(struct sock *sk, int flags,
/* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */
rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags);
if (rc < 0 && rc != -EAGAIN)
-   tls_err_abort(sk);
+   tls_err_abort(sk, EBADMSG);
 
tls_advance_record_sn(sk, &tls_ctx->tx);
return rc;
-- 
2.9.5



[PATCH RFC 2/5] tls: Move cipher info to a separate struct

2018-03-08 Thread Dave Watson
Separate tx crypto parameters to a separate cipher_context struct.
The same parameters will be used for rx using the same struct.

tls_advance_record_sn is modified to only take the cipher info.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tls.h  | 26 +---
 net/tls/tls_main.c |  8 
 net/tls/tls_sw.c   | 58 --
 3 files changed, 49 insertions(+), 43 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 4913430..019e52d 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -81,6 +81,16 @@ enum {
TLS_PENDING_CLOSED_RECORD
 };
 
+struct cipher_context {
+   u16 prepend_size;
+   u16 tag_size;
+   u16 overhead_size;
+   u16 iv_size;
+   char *iv;
+   u16 rec_seq_size;
+   char *rec_seq;
+};
+
 struct tls_context {
union {
struct tls_crypto_info crypto_send;
@@ -91,13 +101,7 @@ struct tls_context {
 
u8 tx_conf:2;
 
-   u16 prepend_size;
-   u16 tag_size;
-   u16 overhead_size;
-   u16 iv_size;
-   char *iv;
-   u16 rec_seq_size;
-   char *rec_seq;
+   struct cipher_context tx;
 
struct scatterlist *partially_sent_record;
u16 partially_sent_offset;
@@ -190,7 +194,7 @@ static inline bool tls_bigint_increment(unsigned char *seq, int len)
 }
 
 static inline void tls_advance_record_sn(struct sock *sk,
-struct tls_context *ctx)
+struct cipher_context *ctx)
 {
if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size))
tls_err_abort(sk);
@@ -203,9 +207,9 @@ static inline void tls_fill_prepend(struct tls_context *ctx,
 size_t plaintext_len,
 unsigned char record_type)
 {
-   size_t pkt_len, iv_size = ctx->iv_size;
+   size_t pkt_len, iv_size = ctx->tx.iv_size;
 
-   pkt_len = plaintext_len + iv_size + ctx->tag_size;
+   pkt_len = plaintext_len + iv_size + ctx->tx.tag_size;
 
/* we cover nonce explicit here as well, so buf should be of
 * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE
@@ -217,7 +221,7 @@ static inline void tls_fill_prepend(struct tls_context *ctx,
buf[3] = pkt_len >> 8;
buf[4] = pkt_len & 0xFF;
memcpy(buf + TLS_NONCE_OFFSET,
-  ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size);
+  ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size);
 }
 
 static inline void tls_make_aad(char *buf,
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index d824d54..c671560 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -259,8 +259,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
}
}
 
-   kfree(ctx->rec_seq);
-   kfree(ctx->iv);
+   kfree(ctx->tx.rec_seq);
+   kfree(ctx->tx.iv);
 
if (ctx->tx_conf == TLS_SW_TX)
tls_sw_free_tx_resources(sk);
@@ -319,9 +319,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
}
lock_sock(sk);
memcpy(crypto_info_aes_gcm_128->iv,
-  ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+  ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
   TLS_CIPHER_AES_GCM_128_IV_SIZE);
-   memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->rec_seq,
+   memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq,
   TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
release_sock(sk);
if (copy_to_user(optval,
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index d58f675..dd4441d 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -79,7 +79,7 @@ static void trim_both_sgl(struct sock *sk, int target_size)
target_size);
 
if (target_size > 0)
-   target_size += tls_ctx->overhead_size;
+   target_size += tls_ctx->tx.overhead_size;
 
trim_sg(sk, ctx->sg_encrypted_data,
&ctx->sg_encrypted_num_elem,
@@ -207,21 +207,21 @@ static int tls_do_encryption(struct tls_context *tls_ctx,
if (!aead_req)
return -ENOMEM;
 
-   ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size;
-   ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size;
+   ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size;
+   ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size;
 
aead_request_set_tfm(aead_req, ctx->aead_send);
aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out,
-  data_len, tls_ctx->iv);
+  

[PATCH RFC 0/5] TLS Rx

2018-03-08 Thread Dave Watson
TLS tcp socket RX implementation, to match existing TX code.

This patchset completes the software TLS socket, allowing full
bi-directional communication over TLS using normal socket syscalls,
after the handshake has been done in userspace.  Only the symmetric
encryption is done in the kernel.

This allows usage of TLS sockets from within the kernel (for example
with network block device, or from bpf).  Performance can be better
than userspace, with appropriate crypto routines [1].

sk->sk_socket->ops must be overridden to implement splice_read and
poll, but otherwise the interface & implementation match TX closely.
strparser is used to parse TLS framing on receive.

There are Openssl RX patches that work with this interface [2], as
well as a testing tool using the socket interface directly (without
cmsg support) [3].  An example tcp socket setup is:

  // Normal tcp socket connect/accept, and TLS handshake
  // using any TLS library.
  setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));

  struct tls12_crypto_info_aes_gcm_128 crypto_info_rx;
  // Fill in crypto_info based on negotiated keys.

  setsockopt(sock, SOL_TLS, TLS_RX, &crypto_info_rx, sizeof(crypto_info_rx));
  // You can optionally set TLS_TX as well.

  char buffer[16384];
  int ret = recv(sock, buffer, 16384);

  // Control messages can be received using recvmsg; a cmsg of type
  // TLS_GET_RECORD_TYPE will be set in msg_control.
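
  // A minimal sketch of reading that cmsg (error handling omitted;
  // assumes the TLS_RX setsockopt above succeeded):
  char cbuf[CMSG_SPACE(sizeof(unsigned char))];
  struct iovec iov = { .iov_base = buffer, .iov_len = sizeof(buffer) };
  struct msghdr msg = { 0 };
  struct cmsghdr *cmsg;

  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  msg.msg_control = cbuf;
  msg.msg_controllen = sizeof(cbuf);

  ret = recvmsg(sock, &msg, 0);
  cmsg = CMSG_FIRSTHDR(&msg);
  if (cmsg && cmsg->cmsg_level == SOL_TLS &&
      cmsg->cmsg_type == TLS_GET_RECORD_TYPE) {
          unsigned char record_type = *CMSG_DATA(cmsg);
          // record_type 23 is application data; alert and handshake
          // records show up here as well.
  }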

[1] Recent crypto patchset to remove copies, resulting in optimally
zero copies, versus one copy for userspace and two for the previous
kernel implementation.

https://marc.info/?l=linux-crypto-vger&m=151931242406416&w=2

[2] https://github.com/Mellanox/openssl/commits/tls_rx

[3] https://github.com/ktls/af_ktls-tool/tree/RX

Dave Watson (5):
  tls: Generalize zerocopy_from_iter
  tls: Move cipher info to a separate struct
  tls: Pass error code explicitly to tls_err_abort
  tls: RX path for ktls
  tls: Add receive path documentation

 Documentation/networking/tls.txt |  59 +++-
 include/net/tls.h|  59 +++-
 include/uapi/linux/tls.h |   2 +
 net/tls/Kconfig  |   1 +
 net/tls/tls_main.c   |  70 -
 net/tls/tls_sw.c | 631 ++-
 6 files changed, 708 insertions(+), 114 deletions(-)

-- 
2.9.5



Re: [Crypto v7 03/12] tls: support for inline tls

2018-02-23 Thread Dave Watson
On 02/23/18 04:58 PM, Atul Gupta wrote:
> > On 02/22/18 11:21 PM, Atul Gupta wrote:
> > > @@ -403,6 +431,15 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
> > >   goto err_crypto_info;
> > >   }
> > >  
> > > + rc = tls_offload_dev_absent(sk);
> > > + if (rc == -EINVAL) {
> > > + goto out;
> > > + } else if (rc == -EEXIST) {
> > > + /* Retain HW unhash for cleanup and move to SW Tx */
> > > + sk->sk_prot[TLS_BASE_TX].unhash =
> > > + sk->sk_prot[TLS_FULL_HW].unhash;
> > 
> > I'm still confused by this, it looks like it is modifying the global 
> > tls_prots without taking a lock?  And modifying it for all sockets, not 
> > just this one?  One way to fix might be to always set an unhash in 
> > TLS_BASE_TX, and then have a function pointer unhash in ctx.
> 
> The code enters do_tls_setsockopt_tx only for offload-capable devices
> that do not define the FULL_HW setsockopt (as chtls does); the unhash
> prot update is required to clean up/revert the setup done in
> tls_hw_hash. This update does not impact the SW or other inline HW paths.

I still don't follow.  If it doesn't impact SW, then what is it doing?
According to the comment, we're moving to SW tx, where sk_prot will be
&tls_prots[TLS_SW_TX], and the unhash function you set here in
TLS_BASE_TX won't be called.



Re: [Crypto v7 03/12] tls: support for inline tls

2018-02-23 Thread Dave Watson
On 02/22/18 11:21 PM, Atul Gupta wrote:
> @@ -403,6 +431,15 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
>   goto err_crypto_info;
>   }
>  
> + rc = tls_offload_dev_absent(sk);
> + if (rc == -EINVAL) {
> + goto out;
> + } else if (rc == -EEXIST) {
> + /* Retain HW unhash for cleanup and move to SW Tx */
> + sk->sk_prot[TLS_BASE_TX].unhash =
> + sk->sk_prot[TLS_FULL_HW].unhash;

I'm still confused by this, it looks like it is modifying the global
tls_prots without taking a lock?  And modifying it for all sockets,
not just this one?  One way to fix might be to always set an unhash in
TLS_BASE_TX, and then have a function pointer unhash in ctx.
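
A rough sketch of that idea (the sk_unhash field here is hypothetical,
it is not in any posted patch):

  static void tls_base_unhash(struct sock *sk)
  {
          struct tls_context *ctx = tls_get_ctx(sk);

          /* per-socket callback saved at setsockopt time (e.g. the
           * HW unhash); nothing global gets modified */
          if (ctx && ctx->sk_unhash)
                  ctx->sk_unhash(sk);
  }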

> +static void tls_hw_unhash(struct sock *sk)
> +{
> + struct tls_device *dev;
> +
> + mutex_lock(&device_mutex);
> + list_for_each_entry(dev, &device_list, dev_list) {
> + if (dev->unhash)
> + dev->unhash(dev, sk);
> + }
> + mutex_unlock(&device_mutex);
> + sk->sk_prot->unhash(sk);

I would have thought unhash() here was tls_hw_unhash, doesn't the
original callback need to be saved like the other ones
(set/getsockopt, etc) in tls_init?  Similar for hash().

It looks like in patch 11 you directly call tcp_prot.hash/unhash, so
it doesn't have this issue.


Re: [Crypto v5 03/12] support for inline tls

2018-02-15 Thread Dave Watson
On 02/15/18 04:10 PM, Atul Gupta wrote:
> > -Original Message-
> > From: Dave Watson [mailto:davejwat...@fb.com] 
> > Sent: Thursday, February 15, 2018 9:22 PM
> > To: Atul Gupta <atul.gu...@chelsio.com>
> > Cc: da...@davemloft.net; herb...@gondor.apana.org.au; s...@queasysnail.net; 
> > linux-crypto@vger.kernel.org; net...@vger.kernel.org; Ganesh GR 
> > <ganes...@chelsio.com>
> > Subject: Re: [Crypto v5 03/12] support for inline tls
> > 
> > On 02/15/18 12:24 PM, Atul Gupta wrote:
> > > @@ -401,6 +430,15 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
> > >   goto out;
> > >   }
> > >  
> > > + rc = get_tls_offload_dev(sk);
> > > + if (rc) {
> > > + goto out;
> > > + } else {
> > > + /* Retain HW unhash for cleanup and move to SW Tx */
> > > + sk->sk_prot[TLS_BASE_TX].unhash =
> > > + sk->sk_prot[TLS_FULL_HW].unhash;
> > 
> > Isn't sk->sk_prot a pointer to a global shared struct here still?  It looks 
> > like this would actually modify the global struct, and not just for this sk.
> Yes, it's global. It requires an additional check so that it is only
> modified when the tls_offload device list has an entry. I will revisit
> and correct.
> 
> Can you look through other changes please?

I looked through 1,2,3,11 (the tls-related ones) and don't have any
other code comments.  Patch 11 commit message still mentions ULP,
could use updating / clarification.



Re: [Crypto v5 03/12] support for inline tls

2018-02-15 Thread Dave Watson
On 02/15/18 12:24 PM, Atul Gupta wrote:
> @@ -401,6 +430,15 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
>   goto out;
>   }
>  
> + rc = get_tls_offload_dev(sk);
> + if (rc) {
> + goto out;
> + } else {
> + /* Retain HW unhash for cleanup and move to SW Tx */
> + sk->sk_prot[TLS_BASE_TX].unhash =
> + sk->sk_prot[TLS_FULL_HW].unhash;

Isn't sk->sk_prot a pointer to a global shared struct here still?  It
looks like this would actually modify the global struct, and not just
for this sk.



[PATCH v2 03/14] x86/crypto: aesni: Add GCM_INIT macro

2018-02-14 Thread Dave Watson
Reduce code duplication by introducing a GCM_INIT macro.  This macro
will also be exposed as a function for implementing scatter/gather
support, since INIT only needs to be called once for the full
operation.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 84 +++
 1 file changed, 33 insertions(+), 51 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 39b42b1..b9fe2ab 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -191,6 +191,37 @@ ALL_F:  .octa 0x
pop %r12
 .endm
 
+
+# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
+# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
+.macro GCM_INIT
+   mov %arg6, %r12
+   movdqu  (%r12), %xmm13
+   movdqa  SHUF_MASK(%rip), %xmm2
+   PSHUFB_XMM %xmm2, %xmm13
+
+   # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+   movdqa  %xmm13, %xmm2
+   psllq   $1, %xmm13
+   psrlq   $63, %xmm2
+   movdqa  %xmm2, %xmm1
+   pslldq  $8, %xmm2
+   psrldq  $8, %xmm1
+   por %xmm2, %xmm13
+
+   # reduce HashKey<<1
+
+   pshufd  $0x24, %xmm1, %xmm2
+   pcmpeqd TWOONE(%rip), %xmm2
+   pandPOLY(%rip), %xmm2
+   pxor%xmm2, %xmm13
+   movdqa  %xmm13, HashKey(%rsp)
+   mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
+   and $-16, %r13
+   mov %r13, %r12
+.endm
+
 #ifdef __x86_64__
 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
@@ -1151,36 +1182,11 @@ _esb_loop_\@:
 */
 ENTRY(aesni_gcm_dec)
FUNC_SAVE
-   mov %arg6, %r12
-   movdqu  (%r12), %xmm13# %xmm13 = HashKey
-movdqa  SHUF_MASK(%rip), %xmm2
-   PSHUFB_XMM %xmm2, %xmm13
-
-
-# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
-
-   movdqa  %xmm13, %xmm2
-   psllq   $1, %xmm13
-   psrlq   $63, %xmm2
-   movdqa  %xmm2, %xmm1
-   pslldq  $8, %xmm2
-   psrldq  $8, %xmm1
-   por %xmm2, %xmm13
-
-# Reduction
-
-   pshufd  $0x24, %xmm1, %xmm2
-   pcmpeqd TWOONE(%rip), %xmm2
-   pandPOLY(%rip), %xmm2
-   pxor%xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
 
+   GCM_INIT
 
 # Decrypt first few blocks
 
-   movdqa %xmm13, HashKey(%rsp)   # store HashKey<<1 (mod poly)
-   mov %arg4, %r13# save the number of bytes of plaintext/ciphertext
-   and $-16, %r13  # %r13 = %r13 - (%r13 mod 16)
-   mov %r13, %r12
and $(3<<4), %r12
jz _initial_num_blocks_is_0_decrypt
cmp $(2<<4), %r12
@@ -1402,32 +1408,8 @@ ENDPROC(aesni_gcm_dec)
 ***/
 ENTRY(aesni_gcm_enc)
FUNC_SAVE
-   mov %arg6, %r12
-   movdqu  (%r12), %xmm13
-movdqa  SHUF_MASK(%rip), %xmm2
-   PSHUFB_XMM %xmm2, %xmm13
-
-# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
-
-   movdqa  %xmm13, %xmm2
-   psllq   $1, %xmm13
-   psrlq   $63, %xmm2
-   movdqa  %xmm2, %xmm1
-   pslldq  $8, %xmm2
-   psrldq  $8, %xmm1
-   por %xmm2, %xmm13
-
-# reduce HashKey<<1
-
-   pshufd  $0x24, %xmm1, %xmm2
-   pcmpeqd TWOONE(%rip), %xmm2
-   pandPOLY(%rip), %xmm2
-   pxor%xmm2, %xmm13
-   movdqa  %xmm13, HashKey(%rsp)
-   mov %arg4, %r13# %xmm13 holds HashKey<<1 (mod poly)
-   and $-16, %r13
-   mov %r13, %r12
 
+   GCM_INIT
 # Encrypt first few blocks
 
and $(3<<4), %r12
-- 
2.9.5



[PATCH v2 05/14] x86/crypto: aesni: Merge encode and decode to GCM_ENC_DEC macro

2018-02-14 Thread Dave Watson
Make a macro for the main encode/decode routine.  Only a small handful
of lines differ for enc and dec.   This will also become the main
scatter/gather update routine.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 293 +++---
 1 file changed, 114 insertions(+), 179 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 529c542..8021fd1 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -222,6 +222,118 @@ ALL_F:  .octa 0x
mov %r13, %r12
 .endm
 
+# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
+# struct has been initialized by GCM_INIT.
+# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
+# Clobbers rax, r10-r13, and xmm0-xmm15
+.macro GCM_ENC_DEC operation
+   # Encrypt/Decrypt first few blocks
+
+   and $(3<<4), %r12
+   jz  _initial_num_blocks_is_0_\@
+   cmp $(2<<4), %r12
+   jb  _initial_num_blocks_is_1_\@
+   je  _initial_num_blocks_is_2_\@
+_initial_num_blocks_is_3_\@:
+   INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
+   sub $48, %r13
+   jmp _initial_blocks_\@
+_initial_num_blocks_is_2_\@:
+   INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
+   sub $32, %r13
+   jmp _initial_blocks_\@
+_initial_num_blocks_is_1_\@:
+   INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
+   sub $16, %r13
+   jmp _initial_blocks_\@
+_initial_num_blocks_is_0_\@:
+   INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
+_initial_blocks_\@:
+
+   # Main loop - Encrypt/Decrypt remaining blocks
+
+   cmp $0, %r13
+   je  _zero_cipher_left_\@
+   sub $64, %r13
+   je  _four_cipher_left_\@
+_crypt_by_4_\@:
+   GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
+   %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
+   %xmm7, %xmm8, enc
+   add $64, %r11
+   sub $64, %r13
+   jne _crypt_by_4_\@
+_four_cipher_left_\@:
+   GHASH_LAST_4%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_\@:
+   mov %arg4, %r13
+   and $15, %r13   # %r13 = arg4 (mod 16)
+   je  _multiple_of_16_bytes_\@
+
+   # Handle the last <16 Byte block separately
+   paddd ONE(%rip), %xmm0# INCR CNT to get Yn
+movdqa SHUF_MASK(%rip), %xmm10
+   PSHUFB_XMM %xmm10, %xmm0
+
+   ENCRYPT_SINGLE_BLOCK%xmm0, %xmm1# Encrypt(K, Yn)
+
+   lea (%arg3,%r11,1), %r10
+   mov %r13, %r12
+   READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+
+   lea ALL_F+16(%rip), %r12
+   sub %r13, %r12
+.ifc \operation, dec
+   movdqa  %xmm1, %xmm2
+.endif
+   pxor%xmm1, %xmm0# XOR Encrypt(K, Yn)
+   movdqu  (%r12), %xmm1
+   # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+   pand%xmm1, %xmm0# mask out top 16-r13 bytes of xmm0
+.ifc \operation, dec
+   pand%xmm1, %xmm2
+   movdqa SHUF_MASK(%rip), %xmm10
+   PSHUFB_XMM %xmm10 ,%xmm2
+
+   pxor %xmm2, %xmm8
+.else
+   movdqa SHUF_MASK(%rip), %xmm10
+   PSHUFB_XMM %xmm10,%xmm0
+
+   pxor%xmm0, %xmm8
+.endif
+
+   GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+.ifc \operation, enc
+   # GHASH computation for the last <16 byte block
+   movdqa SHUF_MASK(%rip), %xmm10
+   # shuffle xmm0 back to output as ciphertext
+   PSHUFB_XMM %xmm10, %xmm0
+.endif
+
+   # Output %r13 bytes
+   MOVQ_R64_XMM %xmm0, %rax
+   cmp $8, %r13
+   jle _less_than_8_bytes_left_\@
+   mov %rax, (%arg2 , %r11, 1)
+   add $8, %r11
+   psrldq $8, %xmm0
+   MOVQ_R64_XMM %xmm0, %rax
+   sub $8, %r13
+_less_than_8_bytes_left_\@:
+   mov %al,  (%arg2, %r11, 1)
+   add $1, %r11
+   shr $8, %rax
+   sub $1, %r13
+   jne _less_than_8_bytes_left_\@
+_multiple_of_16_bytes_\@:
+.endm
+
 # GCM_COMPLETE Finishes update of tag of last partial block
 # Output: Authorization Tag (AUTH_TAG)
 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
@@ -1245,93 +1357,7 @@ ENTRY(aesni_gcm_dec)
FUNC_SAVE
 
GCM_INIT
-
-# Decrypt first few blocks
-
-   and $(3<<4), %r12
-   jz _initial_num_blocks_is_0_decrypt
- 

[PATCH v2 04/14] x86/crypto: aesni: Add GCM_COMPLETE macro

2018-02-14 Thread Dave Watson
Merge encode and decode tag calculations in GCM_COMPLETE macro.
Scatter/gather routines will call this once at the end of encryption
or decryption.
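
At the C level the completed tag is then copied out for encrypt or
compared against the received tag for decrypt; roughly (a sketch only,
the finalize entry point name comes from later patches in this series):

  u8 auth_tag[16];

  aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
  if (!enc && crypto_memneq(auth_tag, received_tag, auth_tag_len))
          return -EBADMSG;        /* authentication failed */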

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 172 ++
 1 file changed, 63 insertions(+), 109 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index b9fe2ab..529c542 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -222,6 +222,67 @@ ALL_F:  .octa 0x
mov %r13, %r12
 .endm
 
+# GCM_COMPLETE Finishes update of tag of last partial block
+# Output: Authorization Tag (AUTH_TAG)
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
+.macro GCM_COMPLETE
+   mov arg8, %r12# %r13 = aadLen (number of bytes)
+   shl $3, %r12  # convert into number of bits
+   movd%r12d, %xmm15 # len(A) in %xmm15
+   shl $3, %arg4 # len(C) in bits (*128)
+   MOVQ_R64_XMM%arg4, %xmm1
+   pslldq  $8, %xmm15# %xmm15 = len(A)||0x
+   pxor%xmm1, %xmm15 # %xmm15 = len(A)||len(C)
+   pxor%xmm15, %xmm8
+   GHASH_MUL   %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+   # final GHASH computation
+   movdqa SHUF_MASK(%rip), %xmm10
+   PSHUFB_XMM %xmm10, %xmm8
+
+   mov %arg5, %rax   # %rax = *Y0
+   movdqu  (%rax), %xmm0 # %xmm0 = Y0
+   ENCRYPT_SINGLE_BLOCK%xmm0,  %xmm1 # E(K, Y0)
+   pxor%xmm8, %xmm0
+_return_T_\@:
+   mov arg9, %r10 # %r10 = authTag
+   mov arg10, %r11# %r11 = auth_tag_len
+   cmp $16, %r11
+   je  _T_16_\@
+   cmp $8, %r11
+   jl  _T_4_\@
+_T_8_\@:
+   MOVQ_R64_XMM%xmm0, %rax
+   mov %rax, (%r10)
+   add $8, %r10
+   sub $8, %r11
+   psrldq  $8, %xmm0
+   cmp $0, %r11
+   je  _return_T_done_\@
+_T_4_\@:
+   movd%xmm0, %eax
+   mov %eax, (%r10)
+   add $4, %r10
+   sub $4, %r11
+   psrldq  $4, %xmm0
+   cmp $0, %r11
+   je  _return_T_done_\@
+_T_123_\@:
+   movd%xmm0, %eax
+   cmp $2, %r11
+   jl  _T_1_\@
+   mov %ax, (%r10)
+   cmp $2, %r11
+   je  _return_T_done_\@
+   add $2, %r10
+   sar $16, %eax
+_T_1_\@:
+   mov %al, (%r10)
+   jmp _return_T_done_\@
+_T_16_\@:
+   movdqu  %xmm0, (%r10)
+_return_T_done_\@:
+.endm
+
 #ifdef __x86_64__
 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
@@ -1271,61 +1332,7 @@ _less_than_8_bytes_left_decrypt:
sub $1, %r13
jne _less_than_8_bytes_left_decrypt
 _multiple_of_16_bytes_decrypt:
-   mov arg8, %r12# %r13 = aadLen (number of bytes)
-   shl $3, %r12  # convert into number of bits
-   movd%r12d, %xmm15 # len(A) in %xmm15
-   shl $3, %arg4 # len(C) in bits (*128)
-   MOVQ_R64_XMM%arg4, %xmm1
-   pslldq  $8, %xmm15# %xmm15 = len(A)||0x
-   pxor%xmm1, %xmm15 # %xmm15 = len(A)||len(C)
-   pxor%xmm15, %xmm8
-   GHASH_MUL   %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-# final GHASH computation
-movdqa SHUF_MASK(%rip), %xmm10
-   PSHUFB_XMM %xmm10, %xmm8
-
-   mov %arg5, %rax   # %rax = *Y0
-   movdqu  (%rax), %xmm0 # %xmm0 = Y0
-   ENCRYPT_SINGLE_BLOCK%xmm0,  %xmm1 # E(K, Y0)
-   pxor%xmm8, %xmm0
-_return_T_decrypt:
-   mov arg9, %r10# %r10 = authTag
-   mov arg10, %r11   # %r11 = auth_tag_len
-   cmp $16, %r11
-   je  _T_16_decrypt
-   cmp $8, %r11
-   jl  _T_4_decrypt
-_T_8_decrypt:
-   MOVQ_R64_XMM%xmm0, %rax
-   mov %rax, (%r10)
-   add $8, %r10
-   sub $8, %r11
-   psrldq  $8, %xmm0
-   cmp $0, %r11
-   je  _return_T_done_decrypt
-_T_4_decrypt:
-   movd%xmm0, %eax
-   mov %eax, (%r10)
-   add $4, %r10
-   sub $4, %r11
-   psrldq  $4, %xmm0
-   cmp $0, %r11
-   je  _return_T_done_decrypt
-_T_123_decrypt:
-   movd%xmm0, %eax
-   cmp $2, %r11
-   jl  _T_1_decrypt
-   mov %ax, (%r10)
-   cmp $2, %r11
-   je  _return_T_done_decrypt
-   add $2, %r10
-   sar $16, %eax
-_T_1_decrypt:
-   mov %al, (%r10)
-   jmp _return_T_done_decrypt
-_T_16_decrypt:
-   movdqu  %xmm0, (%r10)
-_return_T_done_decrypt:
+   GCM_COMPLETE
FUNC_RESTORE
ret
 E

[PATCH v2 07/14] x86/crypto: aesni: Split AAD hash calculation to separate macro

2018-02-14 Thread Dave Watson
AAD hash only needs to be calculated once for each scatter/gather operation.
Move it to its own macro, and call it from GCM_INIT instead of
INITIAL_BLOCKS.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 71 ---
 1 file changed, 43 insertions(+), 28 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 6c5a80d..58bbfac 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -229,6 +229,10 @@ ALL_F:  .octa 0x
mov %arg5, %r13 # %xmm13 holds HashKey<<1 (mod poly)
and $-16, %r13
mov %r13, %r12
+
+   CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
+   %xmm5 %xmm6
+   mov %r13, %r12
 .endm
 
 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
@@ -496,51 +500,62 @@ _read_next_byte_lt8_\@:
 _done_read_partial_block_\@:
 .endm
 
-/*
-* if a = number of total plaintext bytes
-* b = floor(a/16)
-* num_initial_blocks = b mod 4
-* encrypt the initial num_initial_blocks blocks and apply ghash on
-* the ciphertext
-* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
-* are clobbered
-* arg1, %arg3, %arg4, %r14 are used as a pointer only, not modified
-*/
-
-
-.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
-XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
-MOVADQ SHUF_MASK(%rip), %xmm14
-   movarg8, %r10   # %r10 = AAD
-   movarg9, %r11   # %r11 = aadLen
-   pxor   %xmm\i, %xmm\i
-   pxor   \XMM2, \XMM2
+# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+# clobbers r10-11, xmm14
+.macro CALC_AAD_HASH HASHKEY TMP1 TMP2 TMP3 TMP4 TMP5 \
+   TMP6 TMP7
+   MOVADQ SHUF_MASK(%rip), %xmm14
+   movarg8, %r10   # %r10 = AAD
+   movarg9, %r11   # %r11 = aadLen
+   pxor   \TMP7, \TMP7
+   pxor   \TMP6, \TMP6
 
cmp$16, %r11
jl _get_AAD_rest\@
 _get_AAD_blocks\@:
-   movdqu (%r10), %xmm\i
-   PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
-   pxor   %xmm\i, \XMM2
-   GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+   movdqu (%r10), \TMP7
+   PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
+   pxor   \TMP7, \TMP6
+   GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
add$16, %r10
sub$16, %r11
cmp$16, %r11
jge_get_AAD_blocks\@
 
-   movdqu \XMM2, %xmm\i
+   movdqu \TMP6, \TMP7
 
/* read the last <16B of AAD */
 _get_AAD_rest\@:
cmp$0, %r11
je _get_AAD_done\@
 
-   READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
-   PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
-   pxor   \XMM2, %xmm\i
-   GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+   READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
+   PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
+   pxor   \TMP6, \TMP7
+   GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
+   movdqu \TMP7, \TMP6
 
 _get_AAD_done\@:
+   movdqu \TMP6, AadHash(%arg2)
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+   XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+
+   movdqu AadHash(%arg2), %xmm\i   # XMM0 = Y0
+
xor%r11, %r11 # initialise the data pointer offset as zero
# start AES for num_initial_blocks blocks
 
-- 
2.9.5



[PATCH v2 08/14] x86/crypto: aesni: Fill in new context data structures

2018-02-14 Thread Dave Watson
Fill in aadhash, aadlen, pblocklen, curcount with appropriate values.
pblocklen, aadhash, and pblockenckey are also updated at the end
of each scatter/gather operation, to be carried over to the next
operation.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 51 ++-
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 58bbfac..aa82493 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -204,6 +204,21 @@ ALL_F:  .octa 0x
 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
 .macro GCM_INIT
+
+   mov arg9, %r11
+   mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
+   xor %r11, %r11
+   mov %r11, InLen(%arg2) # ctx_data.in_length = 0
+   mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
+   mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
+   mov %arg6, %rax
+   movdqu (%rax), %xmm0
+   movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
+
+   movdqa  SHUF_MASK(%rip), %xmm2
+   PSHUFB_XMM %xmm2, %xmm0
+   movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
+
mov arg7, %r12
movdqu  (%r12), %xmm13
movdqa  SHUF_MASK(%rip), %xmm2
@@ -226,13 +241,9 @@ ALL_F:  .octa 0x
pandPOLY(%rip), %xmm2
pxor%xmm2, %xmm13
movdqa  %xmm13, HashKey(%rsp)
-   mov %arg5, %r13 # %xmm13 holds HashKey<<1 (mod poly)
-   and $-16, %r13
-   mov %r13, %r12
 
CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
%xmm5 %xmm6
-   mov %r13, %r12
 .endm
 
 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
@@ -240,6 +251,12 @@ ALL_F:  .octa 0x
 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 # Clobbers rax, r10-r13, and xmm0-xmm15
 .macro GCM_ENC_DEC operation
+   movdqu AadHash(%arg2), %xmm8
+   movdqu HashKey(%rsp), %xmm13
+   add %arg5, InLen(%arg2)
+   mov %arg5, %r13 # save the number of bytes
+   and $-16, %r13  # %r13 = %r13 - (%r13 mod 16)
+   mov %r13, %r12
# Encrypt/Decrypt first few blocks
 
and $(3<<4), %r12
@@ -284,16 +301,23 @@ _four_cipher_left_\@:
GHASH_LAST_4%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 _zero_cipher_left_\@:
+   movdqu %xmm8, AadHash(%arg2)
+   movdqu %xmm0, CurCount(%arg2)
+
mov %arg5, %r13
and $15, %r13   # %r13 = arg5 (mod 16)
je  _multiple_of_16_bytes_\@
 
+   mov %r13, PBlockLen(%arg2)
+
# Handle the last <16 Byte block separately
paddd ONE(%rip), %xmm0# INCR CNT to get Yn
+   movdqu %xmm0, CurCount(%arg2)
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
 
ENCRYPT_SINGLE_BLOCK%xmm0, %xmm1# Encrypt(K, Yn)
+   movdqu %xmm0, PBlockEncKey(%arg2)
 
lea (%arg4,%r11,1), %r10
mov %r13, %r12
@@ -322,6 +346,7 @@ _zero_cipher_left_\@:
 .endif
 
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+   movdqu %xmm8, AadHash(%arg2)
 .ifc \operation, enc
# GHASH computation for the last <16 byte block
movdqa SHUF_MASK(%rip), %xmm10
@@ -351,11 +376,15 @@ _multiple_of_16_bytes_\@:
 # Output: Authorization Tag (AUTH_TAG)
 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 .macro GCM_COMPLETE
-   mov arg9, %r12# %r13 = aadLen (number of bytes)
+   movdqu AadHash(%arg2), %xmm8
+   movdqu HashKey(%rsp), %xmm13
+   mov AadLen(%arg2), %r12  # %r13 = aadLen (number of bytes)
shl $3, %r12  # convert into number of bits
movd%r12d, %xmm15 # len(A) in %xmm15
-   shl $3, %arg5 # len(C) in bits (*128)
-   MOVQ_R64_XMM%arg5, %xmm1
+   mov InLen(%arg2), %r12
+   shl $3, %r12  # len(C) in bits (*128)
+   MOVQ_R64_XMM%r12, %xmm1
+
pslldq  $8, %xmm15# %xmm15 = len(A)||0x
pxor%xmm1, %xmm15 # %xmm15 = len(A)||len(C)
pxor%xmm15, %xmm8
@@ -364,8 +393,7 @@ _multiple_of_16_bytes_\@:
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm8
 
-   mov %arg6, %rax   # %rax = *Y0
-   movdqu  (%rax), %xmm0 # %xmm0 = Y0
+   movdqu OrigIV(%arg2), %xmm0   # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK%xmm0,  %xmm1 # E(K, Y0)
pxor%xmm8, %xmm0
 _return_T_\@:
@@ -553,15 

[PATCH v2 06/14] x86/crypto: aesni: Introduce gcm_context_data

2018-02-14 Thread Dave Watson
Introduce a gcm_context_data struct that will be used to pass
context data between scatter/gather update calls.  It is passed
as the second argument (after the crypto keys); the other args are
renumbered.
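
On the C side the struct mirrors the asm offsets defined below (AadHash
at 16*0, AadLen at 16*1, and so on); roughly, as a sketch (exact field
names may differ):

  struct gcm_context_data {
          /* init, update and finalize context data */
          u8 aad_hash[16];                /* AadHash,      16*0   */
          u64 aad_length;                 /* AadLen,       16*1   */
          u64 in_length;                  /* InLen,        16*1+8 */
          u8 partial_block_enc_key[16];   /* PBlockEncKey, 16*2   */
          u8 orig_IV[16];                 /* OrigIV,       16*3   */
          u8 current_counter[16];         /* CurCount,     16*4   */
          u64 partial_block_len;          /* PBlockLen,    16*5   */
          /* later patches also keep the precomputed hash keys here */
  };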

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S  | 115 +
 arch/x86/crypto/aesni-intel_glue.c |  81 ++
 2 files changed, 121 insertions(+), 75 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 8021fd1..6c5a80d 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -111,6 +111,14 @@ ALL_F:  .octa 0x
// (for Karatsuba purposes)
 #defineVARIABLE_OFFSET 16*8
 
+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
+
 #define arg1 rdi
 #define arg2 rsi
 #define arg3 rdx
@@ -121,6 +129,7 @@ ALL_F:  .octa 0x
 #define arg8 STACK_OFFSET+16(%r14)
 #define arg9 STACK_OFFSET+24(%r14)
 #define arg10 STACK_OFFSET+32(%r14)
+#define arg11 STACK_OFFSET+40(%r14)
 #define keysize 2*15*16(%arg1)
 #endif
 
@@ -195,9 +204,9 @@ ALL_F:  .octa 0x
 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
 .macro GCM_INIT
-   mov %arg6, %r12
+   mov arg7, %r12
movdqu  (%r12), %xmm13
-   movdqa  SHUF_MASK(%rip), %xmm2
+   movdqa  SHUF_MASK(%rip), %xmm2
PSHUFB_XMM %xmm2, %xmm13
 
# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
@@ -217,7 +226,7 @@ ALL_F:  .octa 0x
pandPOLY(%rip), %xmm2
pxor%xmm2, %xmm13
movdqa  %xmm13, HashKey(%rsp)
-   mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
+   mov %arg5, %r13 # %xmm13 holds HashKey<<1 (mod poly)
and $-16, %r13
mov %r13, %r12
 .endm
@@ -271,18 +280,18 @@ _four_cipher_left_\@:
GHASH_LAST_4%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 _zero_cipher_left_\@:
-   mov %arg4, %r13
-   and $15, %r13   # %r13 = arg4 (mod 16)
+   mov %arg5, %r13
+   and $15, %r13   # %r13 = arg5 (mod 16)
je  _multiple_of_16_bytes_\@
 
# Handle the last <16 Byte block separately
paddd ONE(%rip), %xmm0# INCR CNT to get Yn
-movdqa SHUF_MASK(%rip), %xmm10
+   movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
 
ENCRYPT_SINGLE_BLOCK%xmm0, %xmm1# Encrypt(K, Yn)
 
-   lea (%arg3,%r11,1), %r10
+   lea (%arg4,%r11,1), %r10
mov %r13, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 
@@ -320,13 +329,13 @@ _zero_cipher_left_\@:
MOVQ_R64_XMM %xmm0, %rax
cmp $8, %r13
jle _less_than_8_bytes_left_\@
-   mov %rax, (%arg2 , %r11, 1)
+   mov %rax, (%arg3 , %r11, 1)
add $8, %r11
psrldq $8, %xmm0
MOVQ_R64_XMM %xmm0, %rax
sub $8, %r13
 _less_than_8_bytes_left_\@:
-   mov %al,  (%arg2, %r11, 1)
+   mov %al,  (%arg3, %r11, 1)
add $1, %r11
shr $8, %rax
sub $1, %r13
@@ -338,11 +347,11 @@ _multiple_of_16_bytes_\@:
 # Output: Authorization Tag (AUTH_TAG)
 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 .macro GCM_COMPLETE
-   mov arg8, %r12# %r13 = aadLen (number of bytes)
+   mov arg9, %r12# %r13 = aadLen (number of bytes)
shl $3, %r12  # convert into number of bits
movd%r12d, %xmm15 # len(A) in %xmm15
-   shl $3, %arg4 # len(C) in bits (*128)
-   MOVQ_R64_XMM%arg4, %xmm1
+   shl $3, %arg5 # len(C) in bits (*128)
+   MOVQ_R64_XMM%arg5, %xmm1
pslldq  $8, %xmm15# %xmm15 = len(A)||0x
pxor%xmm1, %xmm15 # %xmm15 = len(A)||len(C)
pxor%xmm15, %xmm8
@@ -351,13 +360,13 @@ _multiple_of_16_bytes_\@:
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm8
 
-   mov %arg5, %rax   # %rax = *Y0
+   mov %arg6, %rax   # %rax = *Y0
movdqu  (%rax), %xmm0 # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK%xmm0,  %xmm1 # E(K, Y0)
pxor%xmm8, %xmm0
 _return_T_\@:
-   mov arg9, %r10 # %r10 = authTag
-   mov arg10, %r11# %r11 = auth_tag_len
+   mov arg10, %r10 # %r10 = authTag

[PATCH v2 09/14] x86/crypto: aesni: Move ghash_mul to GCM_COMPLETE

2018-02-14 Thread Dave Watson
Prepare to handle partial blocks between scatter/gather calls.
For the last partial block, we only want to calculate the aadhash
in GCM_COMPLETE, and a new partial block macro will handle both
aadhash update and encrypting partial blocks between calls.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index aa82493..37b1cee 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -345,7 +345,6 @@ _zero_cipher_left_\@:
pxor%xmm0, %xmm8
 .endif
 
-   GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
movdqu %xmm8, AadHash(%arg2)
 .ifc \operation, enc
# GHASH computation for the last <16 byte block
@@ -378,6 +377,15 @@ _multiple_of_16_bytes_\@:
 .macro GCM_COMPLETE
movdqu AadHash(%arg2), %xmm8
movdqu HashKey(%rsp), %xmm13
+
+   mov PBlockLen(%arg2), %r12
+
+   cmp $0, %r12
+   je _partial_done\@
+
+   GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+
+_partial_done\@:
mov AadLen(%arg2), %r12  # %r13 = aadLen (number of bytes)
shl $3, %r12  # convert into number of bits
movd%r12d, %xmm15 # len(A) in %xmm15
-- 
2.9.5



[PATCH v2 14/14] x86/crypto: aesni: Update aesni-intel_glue to use scatter/gather

2018-02-14 Thread Dave Watson
Add a gcmaes_crypt_by_sg routine that does the scatter/gather walk
itself.  Either src or dst may contain multiple buffers, so
iterate over both at the same time if they are different.
If the input is the same as the output, iterate only over one.

Currently both the AAD and TAG must be linear, so copy them out
with scatterwalk_map_and_copy.  If the first buffer contains the
entire AAD, we can optimize and not copy.  Since the AAD can be
any size, a copy must go on the heap.  The TAG can stay on the
stack since it is always < 16 bytes.

Only the SSE routines are updated so far, so leave the previous
gcmaes_en/decrypt routines, and branch to the sg ones if the
keysize is inappropriate for avx, or we are SSE only.
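
The resulting call pattern is roughly the following (a sketch; the
finalize name is assumed here, see the full routine in the diff below):

  kernel_fpu_begin();
  aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, assoc, assoclen);

  while (left) {
          /* map the next src/dst chunk and clamp to what is mappable */
          len = min(srclen, dstlen);
          if (enc)
                  aesni_gcm_enc_update(aes_ctx, &data, dst, src, len);
          else
                  aesni_gcm_dec_update(aes_ctx, &data, dst, src, len);
          left -= len;
  }

  aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
  kernel_fpu_end();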

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_glue.c | 133 +
 1 file changed, 133 insertions(+)

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index de986f9..acbe7e8 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -791,6 +791,127 @@ static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
return 0;
 }
 
+static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
+ unsigned int assoclen, u8 *hash_subkey,
+ u8 *iv, void *aes_ctx)
+{
+   struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+   unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+   struct gcm_context_data data AESNI_ALIGN_ATTR;
+   struct scatter_walk dst_sg_walk = {};
+   unsigned long left = req->cryptlen;
+   unsigned long len, srclen, dstlen;
+   struct scatter_walk assoc_sg_walk;
+   struct scatter_walk src_sg_walk;
+   struct scatterlist src_start[2];
+   struct scatterlist dst_start[2];
+   struct scatterlist *src_sg;
+   struct scatterlist *dst_sg;
+   u8 *src, *dst, *assoc;
+   u8 *assocmem = NULL;
+   u8 authTag[16];
+
+   if (!enc)
+   left -= auth_tag_len;
+
+   /* Linearize assoc, if not already linear */
+   if (req->src->length >= assoclen && req->src->length &&
+   (!PageHighMem(sg_page(req->src)) ||
+   req->src->offset + req->src->length < PAGE_SIZE)) {
+   scatterwalk_start(&assoc_sg_walk, req->src);
+   assoc = scatterwalk_map(&assoc_sg_walk);
+   } else {
+   /* assoc can be any length, so must be on heap */
+   assocmem = kmalloc(assoclen, GFP_ATOMIC);
+   if (unlikely(!assocmem))
+   return -ENOMEM;
+   assoc = assocmem;
+
+   scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
+   }
+
+   src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
+   scatterwalk_start(&src_sg_walk, src_sg);
+   if (req->src != req->dst) {
+   dst_sg = scatterwalk_ffwd(dst_start, req->dst, req->assoclen);
+   scatterwalk_start(&dst_sg_walk, dst_sg);
+   }
+
+   kernel_fpu_begin();
+   aesni_gcm_init(aes_ctx, &data, iv,
+   hash_subkey, assoc, assoclen);
+   if (req->src != req->dst) {
+   while (left) {
+   src = scatterwalk_map(&src_sg_walk);
+   dst = scatterwalk_map(&dst_sg_walk);
+   srclen = scatterwalk_clamp(&src_sg_walk, left);
+   dstlen = scatterwalk_clamp(&dst_sg_walk, left);
+   len = min(srclen, dstlen);
+   if (len) {
+   if (enc)
+   aesni_gcm_enc_update(aes_ctx, &data,
+dst, src, len);
+   else
+   aesni_gcm_dec_update(aes_ctx, &data,
+dst, src, len);
+   }
+   left -= len;
+
+   scatterwalk_unmap(src);
+   scatterwalk_unmap(dst);
+   scatterwalk_advance(&src_sg_walk, len);
+   scatterwalk_advance(&dst_sg_walk, len);
+   scatterwalk_done(&src_sg_walk, 0, left);
+   scatterwalk_done(&dst_sg_walk, 1, left);
+   }
+   } else {
+   while (left) {
+   dst = src = scatterwalk_map(&src_sg_walk);
+   len = scatterwalk_clamp(&src_sg_walk, left);
+   if (len) {
+   if (enc)
+   aesni_gcm_enc_update(aes_ctx, &data,
+src, src, len);
+   else
+   aesni_gcm_dec_u

[PATCH v2 13/14] x86/crypto: aesni: Introduce scatter/gather asm function stubs

2018-02-14 Thread Dave Watson
The asm macros are all set up now; introduce entry points.

GCM_INIT and GCM_COMPLETE have arguments supplied, so that
the new scatter/gather entry points don't have to take all the
arguments, and only the ones they need.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S  | 116 -
 arch/x86/crypto/aesni-intel_glue.c |  16 +
 2 files changed, 106 insertions(+), 26 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index b941952..311b2de 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -200,8 +200,8 @@ ALL_F:  .octa 0x
 # Output: HashKeys stored in gcm_context_data.  Only needs to be called
 # once per key.
 # clobbers r12, and tmp xmm registers.
-.macro PRECOMPUTE TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
-   mov arg7, %r12
+.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
+   mov \SUBKEY, %r12
movdqu  (%r12), \TMP3
movdqa  SHUF_MASK(%rip), \TMP2
PSHUFB_XMM \TMP2, \TMP3
@@ -254,14 +254,14 @@ ALL_F:  .octa 0x
 
 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
-.macro GCM_INIT
-   mov arg9, %r11
+.macro GCM_INIT Iv SUBKEY AAD AADLEN
+   mov \AADLEN, %r11
mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
xor %r11, %r11
mov %r11, InLen(%arg2) # ctx_data.in_length = 0
mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
-   mov %arg6, %rax
+   mov \Iv, %rax
movdqu (%rax), %xmm0
movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 
@@ -269,11 +269,11 @@ ALL_F:  .octa 0x
PSHUFB_XMM %xmm2, %xmm0
movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
 
-   PRECOMPUTE %xmm1 %xmm2 %xmm3 %xmm4 %xmm5 %xmm6 %xmm7
+   PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
movdqa HashKey(%arg2), %xmm13
 
-   CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
-   %xmm5 %xmm6
+   CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
+   %xmm4, %xmm5, %xmm6
 .endm
 
 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
@@ -435,7 +435,7 @@ _multiple_of_16_bytes_\@:
 # GCM_COMPLETE Finishes update of tag of last partial block
 # Output: Authorization Tag (AUTH_TAG)
 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
-.macro GCM_COMPLETE
+.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
movdqu AadHash(%arg2), %xmm8
movdqu HashKey(%arg2), %xmm13
 
@@ -466,8 +466,8 @@ _partial_done\@:
ENCRYPT_SINGLE_BLOCK%xmm0,  %xmm1 # E(K, Y0)
pxor%xmm8, %xmm0
 _return_T_\@:
-   mov arg10, %r10 # %r10 = authTag
-   mov arg11, %r11# %r11 = auth_tag_len
+   mov \AUTHTAG, %r10 # %r10 = authTag
+   mov \AUTHTAGLEN, %r11# %r11 = auth_tag_len
cmp $16, %r11
je  _T_16_\@
cmp $8, %r11
@@ -599,11 +599,11 @@ _done_read_partial_block_\@:
 
 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 # clobbers r10-11, xmm14
-.macro CALC_AAD_HASH HASHKEY TMP1 TMP2 TMP3 TMP4 TMP5 \
+.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 TMP7
MOVADQ SHUF_MASK(%rip), %xmm14
-   movarg8, %r10   # %r10 = AAD
-   movarg9, %r11   # %r11 = aadLen
+   mov\AAD, %r10   # %r10 = AAD
+   mov\AADLEN, %r11# %r11 = aadLen
pxor   \TMP7, \TMP7
pxor   \TMP6, \TMP6
 
@@ -1103,18 +1103,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
mov   keysize,%eax
shr   $2,%eax   # 128->4, 192->6, 256->8
sub   $4,%eax   # 128->0, 192->2, 256->4
-   jzaes_loop_par_enc_done
+   jzaes_loop_par_enc_done\@
 
-aes_loop_par_enc:
+aes_loop_par_enc\@:
MOVADQ(%r10),\TMP3
 .irpc  index, 1234
AESENC\TMP3, %xmm\index
 .endr
add   $16,%r10
sub   $1,%eax
-   jnz   aes_loop_par_enc
+   jnz   aes_loop_par_enc\@
 
-aes_loop_par_enc_done:
+aes_loop_par_enc_done\@:
MOVADQ(%r10), \TMP3
AESENCLAST \TMP3, \XMM1   # Round 10
AESENCLAST \TMP3, \XMM2
@@ -1311,18 +1311,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
mov   keysize,%eax
shr   $2,%eax   # 128->4, 192->6, 256->8
sub   $4,%eax  

[PATCH v2 12/14] x86/crypto: aesni: Add fast path for > 16 byte update

2018-02-14 Thread Dave Watson
We can fast-path any < 16 byte read if the full message is > 16 bytes,
and shift over by the appropriate amount.  Usually we are
reading > 16 bytes, so this should be faster than the READ_PARTIAL_BLOCK
macro introduced in b20209c91e2 for the average case.
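
In C terms the idea is roughly (illustrative only; msg, msg_len and r13
are placeholder names, the real code works on xmm registers):

  u8 block[16];

  /* one unaligned 16-byte read ending exactly at the end of the
   * message, so we never read past it ... */
  memcpy(block, msg + msg_len - 16, 16);

  /* ... then shift away the leading 16 - r13 bytes, which is what the
   * PSHUFB with the offset shuffle mask does */
  memmove(block, block + (16 - r13), r13);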

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 25 +
 1 file changed, 25 insertions(+)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 398bd2237f..b941952 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -355,12 +355,37 @@ _zero_cipher_left_\@:
ENCRYPT_SINGLE_BLOCK%xmm0, %xmm1# Encrypt(K, Yn)
movdqu %xmm0, PBlockEncKey(%arg2)
 
+   cmp $16, %arg5
+   jge _large_enough_update_\@
+
lea (%arg4,%r11,1), %r10
mov %r13, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+   jmp _data_read_\@
+
+_large_enough_update_\@:
+   sub $16, %r11
+   add %r13, %r11
+
+   # receive the last <16 Byte block
+   movdqu  (%arg4, %r11, 1), %xmm1
 
+   sub %r13, %r11
+   add $16, %r11
+
+   lea SHIFT_MASK+16(%rip), %r12
+   # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+   # (r13 is the number of bytes in plaintext mod 16)
+   sub %r13, %r12
+   # get the appropriate shuffle mask
+   movdqu  (%r12), %xmm2
+   # shift right 16-r13 bytes
+   PSHUFB_XMM  %xmm2, %xmm1
+
+_data_read_\@:
lea ALL_F+16(%rip), %r12
sub %r13, %r12
+
 .ifc \operation, dec
movdqa  %xmm1, %xmm2
 .endif
-- 
2.9.5



[PATCH v2 11/14] x86/crypto: aesni: Introduce partial block macro

2018-02-14 Thread Dave Watson
Before this diff, multiple calls to GCM_ENC_DEC will
succeed, but only if all calls are a multiple of 16 bytes.

Handle partial blocks at the start of GCM_ENC_DEC, and update
aadhash as appropriate.

The data offset %r11 is also updated after the partial block.
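
For example (illustrative; entry point names come from earlier patches
in this series), after this change splitting an update at a non-16-byte
boundary gives the same ciphertext and tag as one big call:

  /* one call over the whole buffer ... */
  aesni_gcm_enc_update(aes_ctx, &data, out, in, len);

  /* ... or the same data split at byte 20: PARTIAL_BLOCK carries the
   * 4-byte partial block across calls via gcm_context_data (data2 and
   * out2 are a second, freshly initialized context and output) */
  aesni_gcm_enc_update(aes_ctx, &data2, out2, in, 20);
  aesni_gcm_enc_update(aes_ctx, &data2, out2 + 20, in + 20, len - 20);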

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 151 +-
 1 file changed, 150 insertions(+), 1 deletion(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 3ada06b..398bd2237f 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -284,7 +284,13 @@ ALL_F:  .octa 0x
movdqu AadHash(%arg2), %xmm8
movdqu HashKey(%arg2), %xmm13
add %arg5, InLen(%arg2)
+
+   xor %r11, %r11 # initialise the data pointer offset as zero
+   PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
+
+   sub %r11, %arg5 # sub partial block data used
mov %arg5, %r13 # save the number of bytes
+
and $-16, %r13  # %r13 = %r13 - (%r13 mod 16)
mov %r13, %r12
# Encrypt/Decrypt first few blocks
@@ -605,6 +611,150 @@ _get_AAD_done\@:
movdqu \TMP6, AadHash(%arg2)
 .endm
 
+# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
+# between update calls.
+# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
+# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
+# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
+.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+   AAD_HASH operation
+   mov PBlockLen(%arg2), %r13
+   cmp $0, %r13
+   je  _partial_block_done_\@  # Leave Macro if no partial blocks
+   # Read in input data without over reading
+   cmp $16, \PLAIN_CYPH_LEN
+   jl  _fewer_than_16_bytes_\@
+   movups  (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
+   jmp _data_read_\@
+
+_fewer_than_16_bytes_\@:
+   lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
+   mov \PLAIN_CYPH_LEN, %r12
+   READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
+
+   mov PBlockLen(%arg2), %r13
+
+_data_read_\@: # Finished reading in data
+
+   movdqu  PBlockEncKey(%arg2), %xmm9
+   movdqu  HashKey(%arg2), %xmm13
+
+   lea SHIFT_MASK(%rip), %r12
+
+   # adjust the shuffle mask pointer to be able to shift r13 bytes
+   # (16 - r13 is the number of bytes in plaintext mod 16)
+   add %r13, %r12
+   movdqu  (%r12), %xmm2   # get the appropriate shuffle mask
+   PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
+
+.ifc \operation, dec
+   movdqa  %xmm1, %xmm3
+   pxor%xmm1, %xmm9# Cyphertext XOR E(K, Yn)
+
+   mov \PLAIN_CYPH_LEN, %r10
+   add %r13, %r10
+   # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+   sub $16, %r10
+   # Determine if partial block is not being filled and
+   # shift mask accordingly
+   jge _no_extra_mask_1_\@
+   sub %r10, %r12
+_no_extra_mask_1_\@:
+
+   movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
+   # get the appropriate mask to mask out bottom r13 bytes of xmm9
+   pand%xmm1, %xmm9# mask out bottom r13 bytes of xmm9
+
+   pand%xmm1, %xmm3
+   movdqa  SHUF_MASK(%rip), %xmm10
+   PSHUFB_XMM  %xmm10, %xmm3
+   PSHUFB_XMM  %xmm2, %xmm3
+   pxor%xmm3, \AAD_HASH
+
+   cmp $0, %r10
+   jl  _partial_incomplete_1_\@
+
+   # GHASH computation for the last <16 Byte block
+   GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+   xor %rax,%rax
+
+   mov %rax, PBlockLen(%arg2)
+   jmp _dec_done_\@
+_partial_incomplete_1_\@:
+   add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
+_dec_done_\@:
+   movdqu  \AAD_HASH, AadHash(%arg2)
+.else
+   pxor%xmm1, %xmm9# Plaintext XOR E(K, Yn)
+
+   mov \PLAIN_CYPH_LEN, %r10
+   add %r13, %r10
+   # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+   sub $16, %r10
+   # Determine if partial block is not being filled and
+   # shift mask accordingly
+   jge _no_extra_mask_2_\@
+   sub %r10, %r12
+_no_extra_mask_2_\@:
+
+   movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
+   # get the appropriate mask to mask out bottom r13 bytes of xmm9
+   pand%xmm1, %xmm9
+
+   movdqa  SHUF_MASK(%rip), %xmm1
+   PSHUFB_XMM %xmm1, %xmm9
+   PSHUFB_XMM %xmm2, %xmm9
+   pxor%xmm9, \AAD_HASH
+
+   cmp $0, %r10
+   jl  _partial_incomplete_2_\@
+
+   # GHASH computation for the last <16 Byte block
+   GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

[PATCH v2 10/14] x86/crypto: aesni: Move HashKey computation from stack to gcm_context

2018-02-14 Thread Dave Watson
HashKey computation only needs to happen once per scatter/gather operation,
so save it between calls in the gcm_context struct instead of on the stack.
Since the asm no longer stores anything on the stack, we can use
%rsp directly, and clean up the frame save/restore macros a bit.

Hashkeys actually only need to be calculated once per key and could
be moved to when set_key is called; however, the current glue code
falls back to generic aes code if the fpu is disabled.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 205 --
 1 file changed, 106 insertions(+), 99 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 37b1cee..3ada06b 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -93,23 +93,6 @@ ALL_F:  .octa 0x
 
 
 #defineSTACK_OFFSET8*3
-#defineHashKey 16*0// store HashKey <<1 mod poly here
-#defineHashKey_2   16*1// store HashKey^2 <<1 mod poly here
-#defineHashKey_3   16*2// store HashKey^3 <<1 mod poly here
-#defineHashKey_4   16*3// store HashKey^4 <<1 mod poly here
-#defineHashKey_k   16*4// store XOR of High 64 bits and Low 64
-   // bits of  HashKey <<1 mod poly here
-   //(for Karatsuba purposes)
-#defineHashKey_2_k 16*5// store XOR of High 64 bits and Low 64
-   // bits of  HashKey^2 <<1 mod poly here
-   // (for Karatsuba purposes)
-#defineHashKey_3_k 16*6// store XOR of High 64 bits and Low 64
-   // bits of  HashKey^3 <<1 mod poly here
-   // (for Karatsuba purposes)
-#defineHashKey_4_k 16*7// store XOR of High 64 bits and Low 64
-   // bits of  HashKey^4 <<1 mod poly here
-   // (for Karatsuba purposes)
-#defineVARIABLE_OFFSET 16*8
 
 #define AadHash 16*0
 #define AadLen 16*1
@@ -118,6 +101,22 @@ ALL_F:  .octa 0x
 #define OrigIV 16*3
 #define CurCount 16*4
 #define PBlockLen 16*5
+#defineHashKey 16*6// store HashKey <<1 mod poly here
+#defineHashKey_2   16*7// store HashKey^2 <<1 mod poly here
+#defineHashKey_3   16*8// store HashKey^3 <<1 mod poly here
+#defineHashKey_4   16*9// store HashKey^4 <<1 mod poly here
+#defineHashKey_k   16*10   // store XOR of High 64 bits and Low 64
+   // bits of  HashKey <<1 mod poly here
+   //(for Karatsuba purposes)
+#defineHashKey_2_k 16*11   // store XOR of High 64 bits and Low 64
+   // bits of  HashKey^2 <<1 mod poly here
+   // (for Karatsuba purposes)
+#defineHashKey_3_k 16*12   // store XOR of High 64 bits and Low 64
+   // bits of  HashKey^3 <<1 mod poly here
+   // (for Karatsuba purposes)
+#defineHashKey_4_k 16*13   // store XOR of High 64 bits and Low 64
+   // bits of  HashKey^4 <<1 mod poly here
+   // (for Karatsuba purposes)
 
 #define arg1 rdi
 #define arg2 rsi
@@ -125,11 +124,11 @@ ALL_F:  .octa 0x
 #define arg4 rcx
 #define arg5 r8
 #define arg6 r9
-#define arg7 STACK_OFFSET+8(%r14)
-#define arg8 STACK_OFFSET+16(%r14)
-#define arg9 STACK_OFFSET+24(%r14)
-#define arg10 STACK_OFFSET+32(%r14)
-#define arg11 STACK_OFFSET+40(%r14)
+#define arg7 STACK_OFFSET+8(%rsp)
+#define arg8 STACK_OFFSET+16(%rsp)
+#define arg9 STACK_OFFSET+24(%rsp)
+#define arg10 STACK_OFFSET+32(%rsp)
+#define arg11 STACK_OFFSET+40(%rsp)
 #define keysize 2*15*16(%arg1)
 #endif
 
@@ -183,28 +182,79 @@ ALL_F:  .octa 0x
push%r12
push%r13
push%r14
-   mov %rsp, %r14
 #
 # states of %xmm registers %xmm6:%xmm15 not saved
 # all %xmm registers are clobbered
 #
-   sub $VARIABLE_OFFSET, %rsp
-   and $~63, %rsp
 .endm
 
 
 .macro FUNC_RESTORE
-   mov %r14, %rsp
pop %r14
pop %r13
pop %r12
 .endm
 
+# Precompute hashkeys.
+# Input: Hash subkey.
+# Output: HashKeys stored in gcm_context_data.  Only needs to be called
+# once per key.
+# clobbers r12, and tmp xmm registers.
+.macro PRECOMPUTE TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
+   mov arg7, %r12
+   movdqu  (%r12), \TMP3
+   movdqa  SHUF_MASK(%rip), \TMP2
+   PSHUFB_XMM \TMP2, \TMP3
+
+   # precompute HashK

[PATCH v2 01/14] x86/crypto: aesni: Merge INITIAL_BLOCKS_ENC/DEC

2018-02-14 Thread Dave Watson
Use macro operations to merge implementations of INITIAL_BLOCKS,
since they differ by only a small handful of lines.

Use macro counter \@ to simplify implementation.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 298 ++
 1 file changed, 48 insertions(+), 250 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 76d8cd4..48911fe 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -275,234 +275,7 @@ _done_read_partial_block_\@:
 */
 
 
-.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
-XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
-MOVADQ SHUF_MASK(%rip), %xmm14
-   movarg7, %r10   # %r10 = AAD
-   movarg8, %r11   # %r11 = aadLen
-   pxor   %xmm\i, %xmm\i
-   pxor   \XMM2, \XMM2
-
-   cmp$16, %r11
-   jl _get_AAD_rest\num_initial_blocks\operation
-_get_AAD_blocks\num_initial_blocks\operation:
-   movdqu (%r10), %xmm\i
-   PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
-   pxor   %xmm\i, \XMM2
-   GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-   add$16, %r10
-   sub$16, %r11
-   cmp$16, %r11
-   jge_get_AAD_blocks\num_initial_blocks\operation
-
-   movdqu \XMM2, %xmm\i
-
-   /* read the last <16B of AAD */
-_get_AAD_rest\num_initial_blocks\operation:
-   cmp$0, %r11
-   je _get_AAD_done\num_initial_blocks\operation
-
-   READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
-   PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
-   pxor   \XMM2, %xmm\i
-   GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-
-_get_AAD_done\num_initial_blocks\operation:
-   xor%r11, %r11 # initialise the data pointer offset as zero
-   # start AES for num_initial_blocks blocks
-
-   mov%arg5, %rax  # %rax = *Y0
-   movdqu (%rax), \XMM0# XMM0 = Y0
-   PSHUFB_XMM   %xmm14, \XMM0
-
-.if (\i == 5) || (\i == 6) || (\i == 7)
-   MOVADQ  ONE(%RIP),\TMP1
-   MOVADQ  (%arg1),\TMP2
-.irpc index, \i_seq
-   paddd  \TMP1, \XMM0 # INCR Y0
-   movdqa \XMM0, %xmm\index
-   PSHUFB_XMM   %xmm14, %xmm\index  # perform a 16 byte swap
-   pxor   \TMP2, %xmm\index
-.endr
-   lea 0x10(%arg1),%r10
-   mov keysize,%eax
-   shr $2,%eax # 128->4, 192->6, 256->8
-   add $5,%eax   # 128->9, 192->11, 256->13
-
-aes_loop_initial_dec\num_initial_blocks:
-   MOVADQ  (%r10),\TMP1
-.irpc  index, \i_seq
-   AESENC  \TMP1, %xmm\index
-.endr
-   add $16,%r10
-   sub $1,%eax
-   jnz aes_loop_initial_dec\num_initial_blocks
-
-   MOVADQ  (%r10), \TMP1
-.irpc index, \i_seq
-   AESENCLAST \TMP1, %xmm\index # Last Round
-.endr
-.irpc index, \i_seq
-   movdqu (%arg3 , %r11, 1), \TMP1
-   pxor   \TMP1, %xmm\index
-   movdqu %xmm\index, (%arg2 , %r11, 1)
-   # write back plaintext/ciphertext for num_initial_blocks
-   add$16, %r11
-
-   movdqa \TMP1, %xmm\index
-   PSHUFB_XMM %xmm14, %xmm\index
-# prepare plaintext/ciphertext for GHASH computation
-.endr
-.endif
-
-# apply GHASH on num_initial_blocks blocks
-
-.if \i == 5
-pxor   %xmm5, %xmm6
-   GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-pxor   %xmm6, %xmm7
-   GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-pxor   %xmm7, %xmm8
-   GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 6
-pxor   %xmm6, %xmm7
-   GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-pxor   %xmm7, %xmm8
-   GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 7
-pxor   %xmm7, %xmm8
-   GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.endif
-   cmp$64, %r13
-   jl  _initial_blocks_done\num_initial_blocks\operation
-   # no need for precomputed values
-/*
-*
-* Precomputations for HashKey parallel with encryption of first 4 blocks.
-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-*/
-   MOVADQ ONE(%rip), \TMP1
-   paddd  \TMP1, \XMM0  # INCR Y0
-   MOVADQ \XMM0, \XMM1
-   PSHUFB_XMM  %xmm14, \XMM1# perform a 16 byte swap
-
-   paddd  \TMP1, \XMM0  # INCR Y0
-   MOVADQ \XMM0, \XMM2
-   PSHUFB_XMM  %xmm14, \XMM2# perform a 16 byte swap
-
-   paddd  \TMP1, \XMM0  # INCR Y0
-   

[PATCH v2 02/14] x86/crypto: aesni: Macro-ify func save/restore

2018-02-14 Thread Dave Watson
Macro-ify function save and restore.  These will be used in new functions
added for scatter/gather update operations.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 53 ++-
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 48911fe..39b42b1 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -170,6 +170,26 @@ ALL_F:  .octa 0x
 #define TKEYP  T1
 #endif
 
+.macro FUNC_SAVE
+   push%r12
+   push%r13
+   push%r14
+   mov %rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+   sub $VARIABLE_OFFSET, %rsp
+   and $~63, %rsp
+.endm
+
+
+.macro FUNC_RESTORE
+   mov %r14, %rsp
+   pop %r14
+   pop %r13
+   pop %r12
+.endm
 
 #ifdef __x86_64__
 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@@ -1130,16 +1150,7 @@ _esb_loop_\@:
 *
 */
 ENTRY(aesni_gcm_dec)
-   push%r12
-   push%r13
-   push%r14
-   mov %rsp, %r14
-/*
-* states of %xmm registers %xmm6:%xmm15 not saved
-* all %xmm registers are clobbered
-*/
-   sub $VARIABLE_OFFSET, %rsp
-   and $~63, %rsp# align rsp to 64 bytes
+   FUNC_SAVE
mov %arg6, %r12
movdqu  (%r12), %xmm13# %xmm13 = HashKey
 movdqa  SHUF_MASK(%rip), %xmm2
@@ -1309,10 +1320,7 @@ _T_1_decrypt:
 _T_16_decrypt:
movdqu  %xmm0, (%r10)
 _return_T_done_decrypt:
-   mov %r14, %rsp
-   pop %r14
-   pop %r13
-   pop %r12
+   FUNC_RESTORE
ret
 ENDPROC(aesni_gcm_dec)
 
@@ -1393,22 +1401,12 @@ ENDPROC(aesni_gcm_dec)
 * poly = x^128 + x^127 + x^126 + x^121 + 1
 ***/
 ENTRY(aesni_gcm_enc)
-   push%r12
-   push%r13
-   push%r14
-   mov %rsp, %r14
-#
-# states of %xmm registers %xmm6:%xmm15 not saved
-# all %xmm registers are clobbered
-#
-   sub $VARIABLE_OFFSET, %rsp
-   and $~63, %rsp
+   FUNC_SAVE
mov %arg6, %r12
movdqu  (%r12), %xmm13
 movdqa  SHUF_MASK(%rip), %xmm2
PSHUFB_XMM %xmm2, %xmm13
 
-
 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 
movdqa  %xmm13, %xmm2
@@ -1576,10 +1574,7 @@ _T_1_encrypt:
 _T_16_encrypt:
movdqu  %xmm0, (%r10)
 _return_T_done_encrypt:
-   mov %r14, %rsp
-   pop %r14
-   pop %r13
-   pop %r12
+   FUNC_RESTORE
ret
 ENDPROC(aesni_gcm_enc)
 
-- 
2.9.5



[PATCH v2 00/14] x86/crypto gcmaes SSE scatter/gather support

2018-02-14 Thread Dave Watson
This patch set refactors the x86 aes/gcm SSE crypto routines to
support true scatter/gather by adding gcm_enc/dec_update methods.
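
In rough C terms, the per-request flow this enables looks like the
sketch below (illustrative only: the buffer struct and wrapper are made
up for this example, the real glue code walks scatterlists, and the asm
calls are wrapped in kernel_fpu_begin()/kernel_fpu_end()):

struct gcm_buf {
        u8 *dst;
        const u8 *src;
        unsigned long len;
};

static void gcm_sg_encrypt(void *aes_ctx, struct gcm_context_data *gdata,
                           u8 *iv, u8 *hash_subkey,
                           const u8 *assoc, unsigned long assoclen,
                           const struct gcm_buf *bufs, int nbufs,
                           u8 *auth_tag, unsigned long auth_tag_len)
{
        int i;

        /* hash the AAD and set up counters once per request */
        aesni_gcm_init(aes_ctx, gdata, iv, hash_subkey, assoc, assoclen);

        /* one update call per contiguous buffer */
        for (i = 0; i < nbufs; i++)
                aesni_gcm_enc_update(aes_ctx, gdata,
                                     bufs[i].dst, bufs[i].src, bufs[i].len);

        /* compute the tag over everything processed so far */
        aesni_gcm_finalize(aes_ctx, gdata, auth_tag, auth_tag_len);
}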

The layout is:

* First 5 patches refactor the code to use macros, so changes only
  need to be applied once for encode and decode.  There should be no
  functional changes.

* The next 6 patches introduce a gcm_context structure to be passed
  between scatter/gather calls to maintain state.  The struct is also
  used as scratch space for the existing enc/dec routines.

* The last 2 set up the asm function entry points for scatter gather
  support, and then call the new routines per buffer in the passed in
  sglist in aesni-intel_glue.

Testing: 
asm itself fuzz tested vs. existing code and isa-l asm.
Ran libkcapi test suite, passes.

perf of a large (16k messages) TLS sends sg vs. no sg:

no-sg

33287255597  cycles  
53702871176  instructions

43.47%   _crypt_by_4
17.83%   memcpy
16.36%   aes_loop_par_enc_done

sg

27568944591  cycles 
54580446678  instructions

49.87%   _crypt_by_4
17.40%   aes_loop_par_enc_done
1.79%aes_loop_initial_5416
1.52%aes_loop_initial_4974
1.27%gcmaes_encrypt_sg.constprop.15

V1 -> V2:

patch 14: merge enc/dec
  also use new routine if cryptlen < AVX_GEN2_OPTSIZE
  optimize case if assoc is already linear

Dave Watson (14):
  x86/crypto: aesni: Merge INITIAL_BLOCKS_ENC/DEC
  x86/crypto: aesni: Macro-ify func save/restore
  x86/crypto: aesni: Add GCM_INIT macro
  x86/crypto: aesni: Add GCM_COMPLETE macro
  x86/crypto: aesni: Merge encode and decode to GCM_ENC_DEC macro
  x86/crypto: aesni: Introduce gcm_context_data
  x86/crypto: aesni: Split AAD hash calculation to separate macro
  x86/crypto: aesni: Fill in new context data structures
  x86/crypto: aesni: Move ghash_mul to GCM_COMPLETE
  x86/crypto: aesni: Move HashKey computation from stack to gcm_context
  x86/crypto: aesni: Introduce partial block macro
  x86/crypto: aesni: Add fast path for > 16 byte update
  x86/crypto: aesni: Introduce scatter/gather asm function stubs
  x86/crypto: aesni: Update aesni-intel_glue to use scatter/gather

 arch/x86/crypto/aesni-intel_asm.S  | 1414 ++--
 arch/x86/crypto/aesni-intel_glue.c |  230 +-
 2 files changed, 899 insertions(+), 745 deletions(-)

-- 
2.9.5



Re: [PATCH 14/14] x86/crypto: aesni: Update aesni-intel_glue to use scatter/gather

2018-02-13 Thread Dave Watson
On 02/13/18 08:42 AM, Stephan Mueller wrote:
> > +static int gcmaes_encrypt_sg(struct aead_request *req, unsigned int
> > assoclen, + u8 *hash_subkey, u8 *iv, void *aes_ctx)
> > +{
> > +   struct crypto_aead *tfm = crypto_aead_reqtfm(req);
> > +   unsigned long auth_tag_len = crypto_aead_authsize(tfm);
> > +   struct gcm_context_data data AESNI_ALIGN_ATTR;
> > +   struct scatter_walk dst_sg_walk = {};
> > +   unsigned long left = req->cryptlen;
> > +   unsigned long len, srclen, dstlen;
> > +   struct scatter_walk src_sg_walk;
> > +   struct scatterlist src_start[2];
> > +   struct scatterlist dst_start[2];
> > +   struct scatterlist *src_sg;
> > +   struct scatterlist *dst_sg;
> > +   u8 *src, *dst, *assoc;
> > +   u8 authTag[16];
> > +
> > +   assoc = kmalloc(assoclen, GFP_ATOMIC);
> > +   if (unlikely(!assoc))
> > +   return -ENOMEM;
> > +   scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
> 
> Have you tested that this code does not barf when assoclen is 0?
> 
> Maybe it is worth while to finally add a test vector to testmgr.h which 
> validates such scenario. If you would like, here is a vector you could add to 
> testmgr:
> 
> https://github.com/smuellerDD/libkcapi/blob/master/test/test.sh#L315

I tested assoclen and cryptlen being 0 and it works, yes.  Both
kmalloc and scatterwalk_map_and_copy work correctly with 0 assoclen.

> This is a decryption of gcm(aes) with no message, no AAD and just a tag. The 
> result should be EBADMSG.
> > +
> > +   src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
> 
> Why do you use assoclen in the map_and_copy, and req->assoclen in the ffwd?

If I understand correctly, rfc4106 appends extra data after the assoc.
assoclen is the real assoc length, while req->assoclen is assoclen plus
the extra data length.  So we ffwd by req->assoclen in the scatterlist,
but use assoclen for the memcpy and the hashing.
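
For illustration only (this helper is not in the patch), the length
relationship is just:

/* rfc4106: 8 bytes of extra data travel with the AAD region */
static unsigned int rfc4106_real_assoclen(unsigned int req_assoclen)
{
        return req_assoclen - 8;        /* strip the extra data */
}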

> > 
> > +static int gcmaes_decrypt_sg(struct aead_request *req, unsigned int
> > assoclen, + u8 *hash_subkey, u8 *iv, void *aes_ctx)
> > +{
> 
> This is a lot of code duplication.

I will merge them and send a V2.

> Ciao
> Stephan
> 
> 

Thanks!


Re: [PATCH 14/14] x86/crypto: aesni: Update aesni-intel_glue to use scatter/gather

2018-02-13 Thread Dave Watson
On 02/12/18 03:12 PM, Junaid Shahid wrote:
> Hi Dave,
> 
> 
> On 02/12/2018 11:51 AM, Dave Watson wrote:
> 
> > +static int gcmaes_encrypt_sg(struct aead_request *req, unsigned int 
> > assoclen,
> > +   u8 *hash_subkey, u8 *iv, void *aes_ctx)
> >  
> > +static int gcmaes_decrypt_sg(struct aead_request *req, unsigned int 
> > assoclen,
> > +   u8 *hash_subkey, u8 *iv, void *aes_ctx)
> 
> These two functions are almost identical. Wouldn't it be better to combine 
> them into a single encrypt/decrypt function, similar to what you have done 
> for the assembly macros?
> 
> > +   if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
> > +   aesni_gcm_enc_tfm == aesni_gcm_enc) {
> 
> Shouldn't we also include a check for the buffer length being less than 
> AVX_GEN2_OPTSIZE? AVX will not be used in that case either.

Yes, these both sound reasonable.  I will send a V2.

Thanks!


[PATCH 06/14] x86/crypto: aesni: Introduce gcm_context_data

2018-02-12 Thread Dave Watson
Introduce a gcm_context_data struct that will be used to pass
context data between scatter/gather update calls.  It is passed
as the second argument (after the crypto keys); the remaining
arguments are renumbered.
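
For reference, the layout implied by the offsets below is roughly the
following C struct (a sketch only; field names here are illustrative,
and the hash key slots are appended by a later patch in the series):

struct gcm_context_data {
        u8  aad_hash[16];               /* AadHash,      16*0        */
        u64 aad_length;                 /* AadLen,       16*1        */
        u64 in_length;                  /* InLen,        16*1 + 8    */
        u8  partial_block_enc_key[16];  /* PBlockEncKey, 16*2        */
        u8  orig_iv[16];                /* OrigIV,       16*3        */
        u8  current_counter[16];        /* CurCount,     16*4        */
        u64 partial_block_length;       /* PBlockLen,    16*5        */
        u64 unused;
        u8  hash_keys[16 * 8];          /* HashKey..HashKey_4_k, added later */
};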

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S  | 115 +
 arch/x86/crypto/aesni-intel_glue.c |  81 ++
 2 files changed, 121 insertions(+), 75 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 8021fd1..6c5a80d 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -111,6 +111,14 @@ ALL_F:  .octa 0x
// (for Karatsuba purposes)
 #defineVARIABLE_OFFSET 16*8
 
+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
+
 #define arg1 rdi
 #define arg2 rsi
 #define arg3 rdx
@@ -121,6 +129,7 @@ ALL_F:  .octa 0x
 #define arg8 STACK_OFFSET+16(%r14)
 #define arg9 STACK_OFFSET+24(%r14)
 #define arg10 STACK_OFFSET+32(%r14)
+#define arg11 STACK_OFFSET+40(%r14)
 #define keysize 2*15*16(%arg1)
 #endif
 
@@ -195,9 +204,9 @@ ALL_F:  .octa 0x
 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
 .macro GCM_INIT
-   mov %arg6, %r12
+   mov arg7, %r12
movdqu  (%r12), %xmm13
-   movdqa  SHUF_MASK(%rip), %xmm2
+   movdqa  SHUF_MASK(%rip), %xmm2
PSHUFB_XMM %xmm2, %xmm13
 
# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
@@ -217,7 +226,7 @@ ALL_F:  .octa 0x
pandPOLY(%rip), %xmm2
pxor%xmm2, %xmm13
movdqa  %xmm13, HashKey(%rsp)
-   mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod 
poly)
+   mov %arg5, %r13 # %xmm13 holds HashKey<<1 (mod poly)
and $-16, %r13
mov %r13, %r12
 .endm
@@ -271,18 +280,18 @@ _four_cipher_left_\@:
GHASH_LAST_4%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 _zero_cipher_left_\@:
-   mov %arg4, %r13
-   and $15, %r13   # %r13 = arg4 (mod 16)
+   mov %arg5, %r13
+   and $15, %r13   # %r13 = arg5 (mod 16)
je  _multiple_of_16_bytes_\@
 
# Handle the last <16 Byte block separately
paddd ONE(%rip), %xmm0# INCR CNT to get Yn
-movdqa SHUF_MASK(%rip), %xmm10
+   movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
 
ENCRYPT_SINGLE_BLOCK%xmm0, %xmm1# Encrypt(K, Yn)
 
-   lea (%arg3,%r11,1), %r10
+   lea (%arg4,%r11,1), %r10
mov %r13, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 
@@ -320,13 +329,13 @@ _zero_cipher_left_\@:
MOVQ_R64_XMM %xmm0, %rax
cmp $8, %r13
jle _less_than_8_bytes_left_\@
-   mov %rax, (%arg2 , %r11, 1)
+   mov %rax, (%arg3 , %r11, 1)
add $8, %r11
psrldq $8, %xmm0
MOVQ_R64_XMM %xmm0, %rax
sub $8, %r13
 _less_than_8_bytes_left_\@:
-   mov %al,  (%arg2, %r11, 1)
+   mov %al,  (%arg3, %r11, 1)
add $1, %r11
shr $8, %rax
sub $1, %r13
@@ -338,11 +347,11 @@ _multiple_of_16_bytes_\@:
 # Output: Authorization Tag (AUTH_TAG)
 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 .macro GCM_COMPLETE
-   mov arg8, %r12# %r13 = aadLen (number of bytes)
+   mov arg9, %r12# %r13 = aadLen (number of bytes)
shl $3, %r12  # convert into number of bits
movd%r12d, %xmm15 # len(A) in %xmm15
-   shl $3, %arg4 # len(C) in bits (*128)
-   MOVQ_R64_XMM%arg4, %xmm1
+   shl $3, %arg5 # len(C) in bits (*128)
+   MOVQ_R64_XMM%arg5, %xmm1
pslldq  $8, %xmm15# %xmm15 = len(A)||0x
pxor%xmm1, %xmm15 # %xmm15 = len(A)||len(C)
pxor%xmm15, %xmm8
@@ -351,13 +360,13 @@ _multiple_of_16_bytes_\@:
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm8
 
-   mov %arg5, %rax   # %rax = *Y0
+   mov %arg6, %rax   # %rax = *Y0
movdqu  (%rax), %xmm0 # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK%xmm0,  %xmm1 # E(K, Y0)
pxor%xmm8, %xmm0
 _return_T_\@:
-   mov arg9, %r10 # %r10 = authTag
-   mov arg10, %r11# %r11 = auth_tag_len
+   mov arg10, %r10 # %r10 = authTag

[PATCH 14/14] x86/crypto: aesni: Update aesni-intel_glue to use scatter/gather

2018-02-12 Thread Dave Watson
Add gcmaes_en/decrypt_sg routines that do true scatter/gather by
walking the scatterlists. Either src or dst may contain multiple
buffers, so iterate over both at the same time if they are different.
If the input is the same as the output, iterate only over one.

Currently both the AAD and TAG must be linear, so copy them out
with scatterwalk_map_and_copy.

Only the SSE routines are updated so far, so leave the previous
gcmaes_en/decrypt routines, and branch to the sg ones if the
keysize is inappropriate for avx, or we are SSE only.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_glue.c | 166 +
 1 file changed, 166 insertions(+)

diff --git a/arch/x86/crypto/aesni-intel_glue.c 
b/arch/x86/crypto/aesni-intel_glue.c
index de986f9..1e32fbe 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -791,6 +791,82 @@ static int generic_gcmaes_set_authsize(struct crypto_aead 
*tfm,
return 0;
 }
 
+static int gcmaes_encrypt_sg(struct aead_request *req, unsigned int assoclen,
+   u8 *hash_subkey, u8 *iv, void *aes_ctx)
+{
+   struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+   unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+   struct gcm_context_data data AESNI_ALIGN_ATTR;
+   struct scatter_walk dst_sg_walk = {};
+   unsigned long left = req->cryptlen;
+   unsigned long len, srclen, dstlen;
+   struct scatter_walk src_sg_walk;
+   struct scatterlist src_start[2];
+   struct scatterlist dst_start[2];
+   struct scatterlist *src_sg;
+   struct scatterlist *dst_sg;
+   u8 *src, *dst, *assoc;
+   u8 authTag[16];
+
+   assoc = kmalloc(assoclen, GFP_ATOMIC);
+   if (unlikely(!assoc))
+   return -ENOMEM;
+   scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
+
+   src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
+   scatterwalk_start(&src_sg_walk, src_sg);
+   if (req->src != req->dst) {
+   dst_sg = scatterwalk_ffwd(dst_start, req->dst, req->assoclen);
+   scatterwalk_start(&dst_sg_walk, dst_sg);
+   }
+
+   kernel_fpu_begin();
+   aesni_gcm_init(aes_ctx, &data, iv,
+   hash_subkey, assoc, assoclen);
+   if (req->src != req->dst) {
+   while (left) {
+   src = scatterwalk_map(&src_sg_walk);
+   dst = scatterwalk_map(&dst_sg_walk);
+   srclen = scatterwalk_clamp(&src_sg_walk, left);
+   dstlen = scatterwalk_clamp(&dst_sg_walk, left);
+   len = min(srclen, dstlen);
+   if (len)
+   aesni_gcm_enc_update(aes_ctx, &data,
+    dst, src, len);
+   left -= len;
+
+   scatterwalk_unmap(src);
+   scatterwalk_unmap(dst);
+   scatterwalk_advance(&src_sg_walk, len);
+   scatterwalk_advance(&dst_sg_walk, len);
+   scatterwalk_done(&src_sg_walk, 0, left);
+   scatterwalk_done(&dst_sg_walk, 1, left);
+   }
+   } else {
+   while (left) {
+   dst = src = scatterwalk_map(&src_sg_walk);
+   len = scatterwalk_clamp(&src_sg_walk, left);
+   if (len)
+   aesni_gcm_enc_update(aes_ctx, &data,
+    src, src, len);
+   left -= len;
+   scatterwalk_unmap(src);
+   scatterwalk_advance(&src_sg_walk, len);
+   scatterwalk_done(&src_sg_walk, 1, left);
+   }
+   }
+   aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len);
+   kernel_fpu_end();
+
+   kfree(assoc);
+
+   /* Copy in the authTag */
+   scatterwalk_map_and_copy(authTag, req->dst,
+   req->assoclen + req->cryptlen,
+   auth_tag_len, 1);
+   return 0;
+}
+
 static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
  u8 *hash_subkey, u8 *iv, void *aes_ctx)
 {
@@ -802,6 +878,11 @@ static int gcmaes_encrypt(struct aead_request *req, 
unsigned int assoclen,
struct scatter_walk dst_sg_walk = {};
struct gcm_context_data data AESNI_ALIGN_ATTR;
 
+   if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
+   aesni_gcm_enc_tfm == aesni_gcm_enc) {
+   return gcmaes_encrypt_sg(req, assoclen, hash_subkey, iv,
+   aes_ctx);
+   }
if (sg_is_last(req->src) &&
(!PageHighMem(sg_page(req->src)) ||
req->src->offset + req->src->length <= PAGE_SIZE) &&
@@ -854,6 

[PATCH 13/14] x86/crypto: aesni: Introduce scatter/gather asm function stubs

2018-02-12 Thread Dave Watson
The asm macros are all set up now; introduce the entry points.

GCM_INIT and GCM_COMPLETE now have their arguments supplied
explicitly, so that the new scatter/gather entry points only have
to take the arguments they actually need.
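
The matching C-side declarations used by the glue code look roughly
like this (a sketch; parameter names are illustrative, and the calling
convention follows how the glue code invokes these routines):

asmlinkage void aesni_gcm_init(void *ctx, struct gcm_context_data *gdata,
                               u8 *iv, u8 *hash_subkey,
                               const u8 *aad, unsigned long aad_len);
asmlinkage void aesni_gcm_enc_update(void *ctx, struct gcm_context_data *gdata,
                                     u8 *out, const u8 *in,
                                     unsigned long plaintext_len);
asmlinkage void aesni_gcm_dec_update(void *ctx, struct gcm_context_data *gdata,
                                     u8 *out, const u8 *in,
                                     unsigned long ciphertext_len);
asmlinkage void aesni_gcm_finalize(void *ctx, struct gcm_context_data *gdata,
                                   u8 *auth_tag, unsigned long auth_tag_len);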

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S  | 116 -
 arch/x86/crypto/aesni-intel_glue.c |  16 +
 2 files changed, 106 insertions(+), 26 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index b941952..311b2de 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -200,8 +200,8 @@ ALL_F:  .octa 0x
 # Output: HashKeys stored in gcm_context_data.  Only needs to be called
 # once per key.
 # clobbers r12, and tmp xmm registers.
-.macro PRECOMPUTE TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
-   mov arg7, %r12
+.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
+   mov \SUBKEY, %r12
movdqu  (%r12), \TMP3
movdqa  SHUF_MASK(%rip), \TMP2
PSHUFB_XMM \TMP2, \TMP3
@@ -254,14 +254,14 @@ ALL_F:  .octa 0x
 
 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
-.macro GCM_INIT
-   mov arg9, %r11
+.macro GCM_INIT Iv SUBKEY AAD AADLEN
+   mov \AADLEN, %r11
mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
xor %r11, %r11
mov %r11, InLen(%arg2) # ctx_data.in_length = 0
mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
-   mov %arg6, %rax
+   mov \Iv, %rax
movdqu (%rax), %xmm0
movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 
@@ -269,11 +269,11 @@ ALL_F:  .octa 0x
PSHUFB_XMM %xmm2, %xmm0
movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
 
-   PRECOMPUTE %xmm1 %xmm2 %xmm3 %xmm4 %xmm5 %xmm6 %xmm7
+   PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
movdqa HashKey(%arg2), %xmm13
 
-   CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
-   %xmm5 %xmm6
+   CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
+   %xmm4, %xmm5, %xmm6
 .endm
 
 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
@@ -435,7 +435,7 @@ _multiple_of_16_bytes_\@:
 # GCM_COMPLETE Finishes update of tag of last partial block
 # Output: Authorization Tag (AUTH_TAG)
 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
-.macro GCM_COMPLETE
+.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
movdqu AadHash(%arg2), %xmm8
movdqu HashKey(%arg2), %xmm13
 
@@ -466,8 +466,8 @@ _partial_done\@:
ENCRYPT_SINGLE_BLOCK%xmm0,  %xmm1 # E(K, Y0)
pxor%xmm8, %xmm0
 _return_T_\@:
-   mov arg10, %r10 # %r10 = authTag
-   mov arg11, %r11# %r11 = auth_tag_len
+   mov \AUTHTAG, %r10 # %r10 = authTag
+   mov \AUTHTAGLEN, %r11# %r11 = auth_tag_len
cmp $16, %r11
je  _T_16_\@
cmp $8, %r11
@@ -599,11 +599,11 @@ _done_read_partial_block_\@:
 
 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 # clobbers r10-11, xmm14
-.macro CALC_AAD_HASH HASHKEY TMP1 TMP2 TMP3 TMP4 TMP5 \
+.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 TMP7
MOVADQ SHUF_MASK(%rip), %xmm14
-   movarg8, %r10   # %r10 = AAD
-   movarg9, %r11   # %r11 = aadLen
+   mov\AAD, %r10   # %r10 = AAD
+   mov\AADLEN, %r11# %r11 = aadLen
pxor   \TMP7, \TMP7
pxor   \TMP6, \TMP6
 
@@ -1103,18 +1103,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 
operation
mov   keysize,%eax
shr   $2,%eax   # 128->4, 192->6, 256->8
sub   $4,%eax   # 128->0, 192->2, 256->4
-   jzaes_loop_par_enc_done
+   jzaes_loop_par_enc_done\@
 
-aes_loop_par_enc:
+aes_loop_par_enc\@:
MOVADQ(%r10),\TMP3
 .irpc  index, 1234
AESENC\TMP3, %xmm\index
 .endr
add   $16,%r10
sub   $1,%eax
-   jnz   aes_loop_par_enc
+   jnz   aes_loop_par_enc\@
 
-aes_loop_par_enc_done:
+aes_loop_par_enc_done\@:
MOVADQ(%r10), \TMP3
AESENCLAST \TMP3, \XMM1   # Round 10
AESENCLAST \TMP3, \XMM2
@@ -1311,18 +1311,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 
operation
mov   keysize,%eax
shr   $2,%eax   # 128->4, 192->6, 256->8
sub   $4,%eax  

[PATCH 05/14] x86/crypto: aesni: Merge encode and decode to GCM_ENC_DEC macro

2018-02-12 Thread Dave Watson
Make a macro for the main encode/decode routine.  Only a small handful
of lines differ for enc and dec.   This will also become the main
scatter/gather update routine.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 293 +++---
 1 file changed, 114 insertions(+), 179 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 529c542..8021fd1 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -222,6 +222,118 @@ ALL_F:  .octa 0x
mov %r13, %r12
 .endm
 
+# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
+# struct has been initialized by GCM_INIT.
+# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
+# Clobbers rax, r10-r13, and xmm0-xmm15
+.macro GCM_ENC_DEC operation
+   # Encrypt/Decrypt first few blocks
+
+   and $(3<<4), %r12
+   jz  _initial_num_blocks_is_0_\@
+   cmp $(2<<4), %r12
+   jb  _initial_num_blocks_is_1_\@
+   je  _initial_num_blocks_is_2_\@
+_initial_num_blocks_is_3_\@:
+   INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
+   sub $48, %r13
+   jmp _initial_blocks_\@
+_initial_num_blocks_is_2_\@:
+   INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
+   sub $32, %r13
+   jmp _initial_blocks_\@
+_initial_num_blocks_is_1_\@:
+   INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
+   sub $16, %r13
+   jmp _initial_blocks_\@
+_initial_num_blocks_is_0_\@:
+   INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
+_initial_blocks_\@:
+
+   # Main loop - Encrypt/Decrypt remaining blocks
+
+   cmp $0, %r13
+   je  _zero_cipher_left_\@
+   sub $64, %r13
+   je  _four_cipher_left_\@
+_crypt_by_4_\@:
+   GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
+   %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
+   %xmm7, %xmm8, enc
+   add $64, %r11
+   sub $64, %r13
+   jne _crypt_by_4_\@
+_four_cipher_left_\@:
+   GHASH_LAST_4%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_\@:
+   mov %arg4, %r13
+   and $15, %r13   # %r13 = arg4 (mod 16)
+   je  _multiple_of_16_bytes_\@
+
+   # Handle the last <16 Byte block separately
+   paddd ONE(%rip), %xmm0# INCR CNT to get Yn
+movdqa SHUF_MASK(%rip), %xmm10
+   PSHUFB_XMM %xmm10, %xmm0
+
+   ENCRYPT_SINGLE_BLOCK%xmm0, %xmm1# Encrypt(K, Yn)
+
+   lea (%arg3,%r11,1), %r10
+   mov %r13, %r12
+   READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+
+   lea ALL_F+16(%rip), %r12
+   sub %r13, %r12
+.ifc \operation, dec
+   movdqa  %xmm1, %xmm2
+.endif
+   pxor%xmm1, %xmm0# XOR Encrypt(K, Yn)
+   movdqu  (%r12), %xmm1
+   # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+   pand%xmm1, %xmm0# mask out top 16-r13 bytes of xmm0
+.ifc \operation, dec
+   pand%xmm1, %xmm2
+   movdqa SHUF_MASK(%rip), %xmm10
+   PSHUFB_XMM %xmm10 ,%xmm2
+
+   pxor %xmm2, %xmm8
+.else
+   movdqa SHUF_MASK(%rip), %xmm10
+   PSHUFB_XMM %xmm10,%xmm0
+
+   pxor%xmm0, %xmm8
+.endif
+
+   GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+.ifc \operation, enc
+   # GHASH computation for the last <16 byte block
+   movdqa SHUF_MASK(%rip), %xmm10
+   # shuffle xmm0 back to output as ciphertext
+   PSHUFB_XMM %xmm10, %xmm0
+.endif
+
+   # Output %r13 bytes
+   MOVQ_R64_XMM %xmm0, %rax
+   cmp $8, %r13
+   jle _less_than_8_bytes_left_\@
+   mov %rax, (%arg2 , %r11, 1)
+   add $8, %r11
+   psrldq $8, %xmm0
+   MOVQ_R64_XMM %xmm0, %rax
+   sub $8, %r13
+_less_than_8_bytes_left_\@:
+   mov %al,  (%arg2, %r11, 1)
+   add $1, %r11
+   shr $8, %rax
+   sub $1, %r13
+   jne _less_than_8_bytes_left_\@
+_multiple_of_16_bytes_\@:
+.endm
+
 # GCM_COMPLETE Finishes update of tag of last partial block
 # Output: Authorization Tag (AUTH_TAG)
 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
@@ -1245,93 +1357,7 @@ ENTRY(aesni_gcm_dec)
FUNC_SAVE
 
GCM_INIT
-
-# Decrypt first few blocks
-
-   and $(3<<4), %r12
-   jz _initial_num_blocks_is_0_decrypt
- 

[PATCH 12/14] x86/crypto: aesni: Add fast path for > 16 byte update

2018-02-12 Thread Dave Watson
We can fast-path any < 16 byte read if the full message is > 16 bytes,
and shift over by the appropriate amount.  Usually we are
reading > 16 bytes, so this should be faster than the READ_PARTIAL_BLOCK
macro introduced in b20209c91e2 for the average case.
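
In C terms the fast path is roughly the following (an illustrative
sketch, not the asm; function and parameter names are made up):

#include <string.h>

typedef unsigned char u8;

/* Requires len >= 16; rem = len % 16 is the size of the final partial
 * block.  Do one 16-byte load ending at the end of the data instead of
 * reading the tail byte by byte.
 */
static void read_last_partial_block(const u8 *data, unsigned long len,
                                    unsigned long rem, u8 block[16])
{
        memcpy(block, data + len - 16, 16);     /* one 16-byte load        */
        memmove(block, block + 16 - rem, rem);  /* shift wanted bytes down */
        memset(block + rem, 0, 16 - rem);       /* clear the rest          */
}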

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 25 +
 1 file changed, 25 insertions(+)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 398bd2237f..b941952 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -355,12 +355,37 @@ _zero_cipher_left_\@:
ENCRYPT_SINGLE_BLOCK%xmm0, %xmm1# Encrypt(K, Yn)
movdqu %xmm0, PBlockEncKey(%arg2)
 
+   cmp $16, %arg5
+   jge _large_enough_update_\@
+
lea (%arg4,%r11,1), %r10
mov %r13, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+   jmp _data_read_\@
+
+_large_enough_update_\@:
+   sub $16, %r11
+   add %r13, %r11
+
+   # receive the last <16 Byte block
+   movdqu  (%arg4, %r11, 1), %xmm1
 
+   sub %r13, %r11
+   add $16, %r11
+
+   lea SHIFT_MASK+16(%rip), %r12
+   # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+   # (r13 is the number of bytes in plaintext mod 16)
+   sub %r13, %r12
+   # get the appropriate shuffle mask
+   movdqu  (%r12), %xmm2
+   # shift right 16-r13 bytes
+   PSHUFB_XMM  %xmm2, %xmm1
+
+_data_read_\@:
lea ALL_F+16(%rip), %r12
sub %r13, %r12
+
 .ifc \operation, dec
movdqa  %xmm1, %xmm2
 .endif
-- 
2.9.5



[PATCH 07/14] x86/crypto: aesni: Split AAD hash calculation to separate macro

2018-02-12 Thread Dave Watson
AAD hash only needs to be calculated once for each scatter/gather operation.
Move it to its own macro, and call it from GCM_INIT instead of
INITIAL_BLOCKS.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 71 ---
 1 file changed, 43 insertions(+), 28 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 6c5a80d..58bbfac 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -229,6 +229,10 @@ ALL_F:  .octa 0x
mov %arg5, %r13 # %xmm13 holds HashKey<<1 (mod poly)
and $-16, %r13
mov %r13, %r12
+
+   CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
+   %xmm5 %xmm6
+   mov %r13, %r12
 .endm
 
 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
@@ -496,51 +500,62 @@ _read_next_byte_lt8_\@:
 _done_read_partial_block_\@:
 .endm
 
-/*
-* if a = number of total plaintext bytes
-* b = floor(a/16)
-* num_initial_blocks = b mod 4
-* encrypt the initial num_initial_blocks blocks and apply ghash on
-* the ciphertext
-* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
-* are clobbered
-* arg1, %arg3, %arg4, %r14 are used as a pointer only, not modified
-*/
-
-
-.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
-XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
-MOVADQ SHUF_MASK(%rip), %xmm14
-   movarg8, %r10   # %r10 = AAD
-   movarg9, %r11   # %r11 = aadLen
-   pxor   %xmm\i, %xmm\i
-   pxor   \XMM2, \XMM2
+# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+# clobbers r10-11, xmm14
+.macro CALC_AAD_HASH HASHKEY TMP1 TMP2 TMP3 TMP4 TMP5 \
+   TMP6 TMP7
+   MOVADQ SHUF_MASK(%rip), %xmm14
+   movarg8, %r10   # %r10 = AAD
+   movarg9, %r11   # %r11 = aadLen
+   pxor   \TMP7, \TMP7
+   pxor   \TMP6, \TMP6
 
cmp$16, %r11
jl _get_AAD_rest\@
 _get_AAD_blocks\@:
-   movdqu (%r10), %xmm\i
-   PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
-   pxor   %xmm\i, \XMM2
-   GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+   movdqu (%r10), \TMP7
+   PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
+   pxor   \TMP7, \TMP6
+   GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
add$16, %r10
sub$16, %r11
cmp$16, %r11
jge_get_AAD_blocks\@
 
-   movdqu \XMM2, %xmm\i
+   movdqu \TMP6, \TMP7
 
/* read the last <16B of AAD */
 _get_AAD_rest\@:
cmp$0, %r11
je _get_AAD_done\@
 
-   READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
-   PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
-   pxor   \XMM2, %xmm\i
-   GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+   READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
+   PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
+   pxor   \TMP6, \TMP7
+   GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
+   movdqu \TMP7, \TMP6
 
 _get_AAD_done\@:
+   movdqu \TMP6, AadHash(%arg2)
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+   XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+
+   movdqu AadHash(%arg2), %xmm\i   # XMM0 = Y0
+
xor%r11, %r11 # initialise the data pointer offset as zero
# start AES for num_initial_blocks blocks
 
-- 
2.9.5



[PATCH 08/14] x86/crypto: aesni: Fill in new context data structures

2018-02-12 Thread Dave Watson
Fill in aadhash, aadlen, pblocklen, curcount with appropriate values.
pblocklen, aadhash, and pblockenckey are also updated at the end
of each scatter/gather operation, to be carried over to the next
operation.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 51 ++-
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 58bbfac..aa82493 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -204,6 +204,21 @@ ALL_F:  .octa 0x
 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
 .macro GCM_INIT
+
+   mov arg9, %r11
+   mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
+   xor %r11, %r11
+   mov %r11, InLen(%arg2) # ctx_data.in_length = 0
+   mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
+   mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
+   mov %arg6, %rax
+   movdqu (%rax), %xmm0
+   movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
+
+   movdqa  SHUF_MASK(%rip), %xmm2
+   PSHUFB_XMM %xmm2, %xmm0
+   movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
+
mov arg7, %r12
movdqu  (%r12), %xmm13
movdqa  SHUF_MASK(%rip), %xmm2
@@ -226,13 +241,9 @@ ALL_F:  .octa 0x
pandPOLY(%rip), %xmm2
pxor%xmm2, %xmm13
movdqa  %xmm13, HashKey(%rsp)
-   mov %arg5, %r13 # %xmm13 holds HashKey<<1 (mod poly)
-   and $-16, %r13
-   mov %r13, %r12
 
CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
%xmm5 %xmm6
-   mov %r13, %r12
 .endm
 
 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
@@ -240,6 +251,12 @@ ALL_F:  .octa 0x
 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 # Clobbers rax, r10-r13, and xmm0-xmm15
 .macro GCM_ENC_DEC operation
+   movdqu AadHash(%arg2), %xmm8
+   movdqu HashKey(%rsp), %xmm13
+   add %arg5, InLen(%arg2)
+   mov %arg5, %r13 # save the number of bytes
+   and $-16, %r13  # %r13 = %r13 - (%r13 mod 16)
+   mov %r13, %r12
# Encrypt/Decrypt first few blocks
 
and $(3<<4), %r12
@@ -284,16 +301,23 @@ _four_cipher_left_\@:
GHASH_LAST_4%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 _zero_cipher_left_\@:
+   movdqu %xmm8, AadHash(%arg2)
+   movdqu %xmm0, CurCount(%arg2)
+
mov %arg5, %r13
and $15, %r13   # %r13 = arg5 (mod 16)
je  _multiple_of_16_bytes_\@
 
+   mov %r13, PBlockLen(%arg2)
+
# Handle the last <16 Byte block separately
paddd ONE(%rip), %xmm0# INCR CNT to get Yn
+   movdqu %xmm0, CurCount(%arg2)
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
 
ENCRYPT_SINGLE_BLOCK%xmm0, %xmm1# Encrypt(K, Yn)
+   movdqu %xmm0, PBlockEncKey(%arg2)
 
lea (%arg4,%r11,1), %r10
mov %r13, %r12
@@ -322,6 +346,7 @@ _zero_cipher_left_\@:
 .endif
 
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+   movdqu %xmm8, AadHash(%arg2)
 .ifc \operation, enc
# GHASH computation for the last <16 byte block
movdqa SHUF_MASK(%rip), %xmm10
@@ -351,11 +376,15 @@ _multiple_of_16_bytes_\@:
 # Output: Authorization Tag (AUTH_TAG)
 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 .macro GCM_COMPLETE
-   mov arg9, %r12# %r13 = aadLen (number of bytes)
+   movdqu AadHash(%arg2), %xmm8
+   movdqu HashKey(%rsp), %xmm13
+   mov AadLen(%arg2), %r12  # %r13 = aadLen (number of bytes)
shl $3, %r12  # convert into number of bits
movd%r12d, %xmm15 # len(A) in %xmm15
-   shl $3, %arg5 # len(C) in bits (*128)
-   MOVQ_R64_XMM%arg5, %xmm1
+   mov InLen(%arg2), %r12
+   shl $3, %r12  # len(C) in bits (*128)
+   MOVQ_R64_XMM%r12, %xmm1
+
pslldq  $8, %xmm15# %xmm15 = len(A)||0x
pxor%xmm1, %xmm15 # %xmm15 = len(A)||len(C)
pxor%xmm15, %xmm8
@@ -364,8 +393,7 @@ _multiple_of_16_bytes_\@:
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm8
 
-   mov %arg6, %rax   # %rax = *Y0
-   movdqu  (%rax), %xmm0 # %xmm0 = Y0
+   movdqu OrigIV(%arg2), %xmm0   # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK%xmm0,  %xmm1 # E(K, Y0)
pxor%xmm8, %xmm0
 _return_T_\@:
@@ -553,15 

[PATCH 09/14] x86/crypto: aesni: Move ghash_mul to GCM_COMPLETE

2018-02-12 Thread Dave Watson
Prepare to handle partial blocks between scatter/gather calls.
For the last partial block, we only want to calculate the aadhash
in GCM_COMPLETE, and a new partial block macro will handle both
aadhash update and encrypting partial blocks between calls.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index aa82493..37b1cee 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -345,7 +345,6 @@ _zero_cipher_left_\@:
pxor%xmm0, %xmm8
 .endif
 
-   GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
movdqu %xmm8, AadHash(%arg2)
 .ifc \operation, enc
# GHASH computation for the last <16 byte block
@@ -378,6 +377,15 @@ _multiple_of_16_bytes_\@:
 .macro GCM_COMPLETE
movdqu AadHash(%arg2), %xmm8
movdqu HashKey(%rsp), %xmm13
+
+   mov PBlockLen(%arg2), %r12
+
+   cmp $0, %r12
+   je _partial_done\@
+
+   GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+
+_partial_done\@:
mov AadLen(%arg2), %r12  # %r13 = aadLen (number of bytes)
shl $3, %r12  # convert into number of bits
movd%r12d, %xmm15 # len(A) in %xmm15
-- 
2.9.5



[PATCH 00/14] x86/crypto gcmaes SSE scatter/gather support

2018-02-12 Thread Dave Watson
This patch set refactors the x86 aes/gcm SSE crypto routines to
support true scatter/gather by adding gcm_enc/dec_update methods.

The layout is:

* First 5 patches refactor the code to use macros, so changes only
  need to be applied once for encode and decode.  There should be no
  functional changes.

* The next 6 patches introduce a gcm_context structure to be passed
  between scatter/gather calls to maintain state.  The struct is also
  used as scratch space for the existing enc/dec routines.

* The last 2 set up the asm function entry points for scatter gather
  support, and then call the new routines per buffer in the passed in
  sglist in aesni-intel_glue.

Testing: 
asm itself fuzz tested vs. existing code and isa-l asm.
Ran libkcapi test suite, passes.
Passes my TLS tests.
Testing of IPSec or other aesni users would be appreciated.

perf of a large (16k messages) TLS sends sg vs. no sg:

no-sg

33287255597  cycles  
53702871176  instructions

43.47%   _crypt_by_4
17.83%   memcpy
16.36%   aes_loop_par_enc_done

sg

27568944591  cycles 
54580446678  instructions

49.87%   _crypt_by_4
17.40%   aes_loop_par_enc_done
1.79%aes_loop_initial_5416
1.52%aes_loop_initial_4974
1.27%gcmaes_encrypt_sg.constprop.15


Dave Watson (14):
  x86/crypto: aesni: Merge INITIAL_BLOCKS_ENC/DEC
  x86/crypto: aesni: Macro-ify func save/restore
  x86/crypto: aesni: Add GCM_INIT macro
  x86/crypto: aesni: Add GCM_COMPLETE macro
  x86/crypto: aesni: Merge encode and decode to GCM_ENC_DEC macro
  x86/crypto: aesni: Introduce gcm_context_data
  x86/crypto: aesni: Split AAD hash calculation to separate macro
  x86/crypto: aesni: Fill in new context data structures
  x86/crypto: aesni: Move ghash_mul to GCM_COMPLETE
  x86/crypto: aesni: Move HashKey computation from stack to gcm_context
  x86/crypto: aesni: Introduce partial block macro
  x86/crypto: aesni: Add fast path for > 16 byte update
  x86/crypto: aesni: Introduce scatter/gather asm function stubs
  x86/crypto: aesni: Update aesni-intel_glue to use scatter/gather

 arch/x86/crypto/aesni-intel_asm.S  | 1414 ++--
 arch/x86/crypto/aesni-intel_glue.c |  263 ++-
 2 files changed, 932 insertions(+), 745 deletions(-)

-- 
2.9.5



[PATCH 03/14] x86/crypto: aesni: Add GCM_INIT macro

2018-02-12 Thread Dave Watson
Reduce code duplication by introducing the GCM_INIT macro.  This macro
will also be exposed as a function for implementing scatter/gather
support, since INIT only needs to be called once for the full
operation.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 84 +++
 1 file changed, 33 insertions(+), 51 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 39b42b1..b9fe2ab 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -191,6 +191,37 @@ ALL_F:  .octa 0x
pop %r12
 .endm
 
+
+# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
+# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
+.macro GCM_INIT
+   mov %arg6, %r12
+   movdqu  (%r12), %xmm13
+   movdqa  SHUF_MASK(%rip), %xmm2
+   PSHUFB_XMM %xmm2, %xmm13
+
+   # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+   movdqa  %xmm13, %xmm2
+   psllq   $1, %xmm13
+   psrlq   $63, %xmm2
+   movdqa  %xmm2, %xmm1
+   pslldq  $8, %xmm2
+   psrldq  $8, %xmm1
+   por %xmm2, %xmm13
+
+   # reduce HashKey<<1
+
+   pshufd  $0x24, %xmm1, %xmm2
+   pcmpeqd TWOONE(%rip), %xmm2
+   pandPOLY(%rip), %xmm2
+   pxor%xmm2, %xmm13
+   movdqa  %xmm13, HashKey(%rsp)
+   mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod 
poly)
+   and $-16, %r13
+   mov %r13, %r12
+.endm
+
 #ifdef __x86_64__
 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
@@ -1151,36 +1182,11 @@ _esb_loop_\@:
 */
 ENTRY(aesni_gcm_dec)
FUNC_SAVE
-   mov %arg6, %r12
-   movdqu  (%r12), %xmm13# %xmm13 = HashKey
-movdqa  SHUF_MASK(%rip), %xmm2
-   PSHUFB_XMM %xmm2, %xmm13
-
-
-# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
-
-   movdqa  %xmm13, %xmm2
-   psllq   $1, %xmm13
-   psrlq   $63, %xmm2
-   movdqa  %xmm2, %xmm1
-   pslldq  $8, %xmm2
-   psrldq  $8, %xmm1
-   por %xmm2, %xmm13
-
-# Reduction
-
-   pshufd  $0x24, %xmm1, %xmm2
-   pcmpeqd TWOONE(%rip), %xmm2
-   pandPOLY(%rip), %xmm2
-   pxor%xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
 
+   GCM_INIT
 
 # Decrypt first few blocks
 
-   movdqa %xmm13, HashKey(%rsp)   # store HashKey<<1 (mod poly)
-   mov %arg4, %r13# save the number of bytes of plaintext/ciphertext
-   and $-16, %r13  # %r13 = %r13 - (%r13 mod 16)
-   mov %r13, %r12
and $(3<<4), %r12
jz _initial_num_blocks_is_0_decrypt
cmp $(2<<4), %r12
@@ -1402,32 +1408,8 @@ ENDPROC(aesni_gcm_dec)
 ***/
 ENTRY(aesni_gcm_enc)
FUNC_SAVE
-   mov %arg6, %r12
-   movdqu  (%r12), %xmm13
-movdqa  SHUF_MASK(%rip), %xmm2
-   PSHUFB_XMM %xmm2, %xmm13
-
-# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
-
-   movdqa  %xmm13, %xmm2
-   psllq   $1, %xmm13
-   psrlq   $63, %xmm2
-   movdqa  %xmm2, %xmm1
-   pslldq  $8, %xmm2
-   psrldq  $8, %xmm1
-   por %xmm2, %xmm13
-
-# reduce HashKey<<1
-
-   pshufd  $0x24, %xmm1, %xmm2
-   pcmpeqd TWOONE(%rip), %xmm2
-   pandPOLY(%rip), %xmm2
-   pxor%xmm2, %xmm13
-   movdqa  %xmm13, HashKey(%rsp)
-   mov %arg4, %r13# %xmm13 holds HashKey<<1 (mod poly)
-   and $-16, %r13
-   mov %r13, %r12
 
+   GCM_INIT
 # Encrypt first few blocks
 
and $(3<<4), %r12
-- 
2.9.5



[PATCH 04/14] x86/crypto: aesni: Add GCM_COMPLETE macro

2018-02-12 Thread Dave Watson
Merge encode and decode tag calculations in GCM_COMPLETE macro.
Scatter/gather routines will call this once at the end of encryption
or decryption.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 172 ++
 1 file changed, 63 insertions(+), 109 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index b9fe2ab..529c542 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -222,6 +222,67 @@ ALL_F:  .octa 0x
mov %r13, %r12
 .endm
 
+# GCM_COMPLETE Finishes update of tag of last partial block
+# Output: Authorization Tag (AUTH_TAG)
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
+.macro GCM_COMPLETE
+   mov arg8, %r12# %r13 = aadLen (number of bytes)
+   shl $3, %r12  # convert into number of bits
+   movd%r12d, %xmm15 # len(A) in %xmm15
+   shl $3, %arg4 # len(C) in bits (*128)
+   MOVQ_R64_XMM%arg4, %xmm1
+   pslldq  $8, %xmm15# %xmm15 = len(A)||0x
+   pxor%xmm1, %xmm15 # %xmm15 = len(A)||len(C)
+   pxor%xmm15, %xmm8
+   GHASH_MUL   %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+   # final GHASH computation
+   movdqa SHUF_MASK(%rip), %xmm10
+   PSHUFB_XMM %xmm10, %xmm8
+
+   mov %arg5, %rax   # %rax = *Y0
+   movdqu  (%rax), %xmm0 # %xmm0 = Y0
+   ENCRYPT_SINGLE_BLOCK%xmm0,  %xmm1 # E(K, Y0)
+   pxor%xmm8, %xmm0
+_return_T_\@:
+   mov arg9, %r10 # %r10 = authTag
+   mov arg10, %r11# %r11 = auth_tag_len
+   cmp $16, %r11
+   je  _T_16_\@
+   cmp $8, %r11
+   jl  _T_4_\@
+_T_8_\@:
+   MOVQ_R64_XMM%xmm0, %rax
+   mov %rax, (%r10)
+   add $8, %r10
+   sub $8, %r11
+   psrldq  $8, %xmm0
+   cmp $0, %r11
+   je  _return_T_done_\@
+_T_4_\@:
+   movd%xmm0, %eax
+   mov %eax, (%r10)
+   add $4, %r10
+   sub $4, %r11
+   psrldq  $4, %xmm0
+   cmp $0, %r11
+   je  _return_T_done_\@
+_T_123_\@:
+   movd%xmm0, %eax
+   cmp $2, %r11
+   jl  _T_1_\@
+   mov %ax, (%r10)
+   cmp $2, %r11
+   je  _return_T_done_\@
+   add $2, %r10
+   sar $16, %eax
+_T_1_\@:
+   mov %al, (%r10)
+   jmp _return_T_done_\@
+_T_16_\@:
+   movdqu  %xmm0, (%r10)
+_return_T_done_\@:
+.endm
+
 #ifdef __x86_64__
 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
@@ -1271,61 +1332,7 @@ _less_than_8_bytes_left_decrypt:
sub $1, %r13
jne _less_than_8_bytes_left_decrypt
 _multiple_of_16_bytes_decrypt:
-   mov arg8, %r12# %r13 = aadLen (number of bytes)
-   shl $3, %r12  # convert into number of bits
-   movd%r12d, %xmm15 # len(A) in %xmm15
-   shl $3, %arg4 # len(C) in bits (*128)
-   MOVQ_R64_XMM%arg4, %xmm1
-   pslldq  $8, %xmm15# %xmm15 = len(A)||0x
-   pxor%xmm1, %xmm15 # %xmm15 = len(A)||len(C)
-   pxor%xmm15, %xmm8
-   GHASH_MUL   %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-# final GHASH computation
-movdqa SHUF_MASK(%rip), %xmm10
-   PSHUFB_XMM %xmm10, %xmm8
-
-   mov %arg5, %rax   # %rax = *Y0
-   movdqu  (%rax), %xmm0 # %xmm0 = Y0
-   ENCRYPT_SINGLE_BLOCK%xmm0,  %xmm1 # E(K, Y0)
-   pxor%xmm8, %xmm0
-_return_T_decrypt:
-   mov arg9, %r10# %r10 = authTag
-   mov arg10, %r11   # %r11 = auth_tag_len
-   cmp $16, %r11
-   je  _T_16_decrypt
-   cmp $8, %r11
-   jl  _T_4_decrypt
-_T_8_decrypt:
-   MOVQ_R64_XMM%xmm0, %rax
-   mov %rax, (%r10)
-   add $8, %r10
-   sub $8, %r11
-   psrldq  $8, %xmm0
-   cmp $0, %r11
-   je  _return_T_done_decrypt
-_T_4_decrypt:
-   movd%xmm0, %eax
-   mov %eax, (%r10)
-   add $4, %r10
-   sub $4, %r11
-   psrldq  $4, %xmm0
-   cmp $0, %r11
-   je  _return_T_done_decrypt
-_T_123_decrypt:
-   movd%xmm0, %eax
-   cmp $2, %r11
-   jl  _T_1_decrypt
-   mov %ax, (%r10)
-   cmp $2, %r11
-   je  _return_T_done_decrypt
-   add $2, %r10
-   sar $16, %eax
-_T_1_decrypt:
-   mov %al, (%r10)
-   jmp _return_T_done_decrypt
-_T_16_decrypt:
-   movdqu  %xmm0, (%r10)
-_return_T_done_decrypt:
+   GCM_COMPLETE
FUNC_RESTORE
ret
 E

[PATCH 02/14] x86/crypto: aesni: Macro-ify func save/restore

2018-02-12 Thread Dave Watson
Macro-ify function save and restore.  These will be used in new functions
added for scatter/gather update operations.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 53 ++-
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 48911fe..39b42b1 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -170,6 +170,26 @@ ALL_F:  .octa 0x
 #define TKEYP  T1
 #endif
 
+.macro FUNC_SAVE
+   push%r12
+   push%r13
+   push%r14
+   mov %rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+   sub $VARIABLE_OFFSET, %rsp
+   and $~63, %rsp
+.endm
+
+
+.macro FUNC_RESTORE
+   mov %r14, %rsp
+   pop %r14
+   pop %r13
+   pop %r12
+.endm
 
 #ifdef __x86_64__
 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@@ -1130,16 +1150,7 @@ _esb_loop_\@:
 *
 */
 ENTRY(aesni_gcm_dec)
-   push%r12
-   push%r13
-   push%r14
-   mov %rsp, %r14
-/*
-* states of %xmm registers %xmm6:%xmm15 not saved
-* all %xmm registers are clobbered
-*/
-   sub $VARIABLE_OFFSET, %rsp
-   and $~63, %rsp# align rsp to 64 bytes
+   FUNC_SAVE
mov %arg6, %r12
movdqu  (%r12), %xmm13# %xmm13 = HashKey
 movdqa  SHUF_MASK(%rip), %xmm2
@@ -1309,10 +1320,7 @@ _T_1_decrypt:
 _T_16_decrypt:
movdqu  %xmm0, (%r10)
 _return_T_done_decrypt:
-   mov %r14, %rsp
-   pop %r14
-   pop %r13
-   pop %r12
+   FUNC_RESTORE
ret
 ENDPROC(aesni_gcm_dec)
 
@@ -1393,22 +1401,12 @@ ENDPROC(aesni_gcm_dec)
 * poly = x^128 + x^127 + x^126 + x^121 + 1
 ***/
 ENTRY(aesni_gcm_enc)
-   push%r12
-   push%r13
-   push%r14
-   mov %rsp, %r14
-#
-# states of %xmm registers %xmm6:%xmm15 not saved
-# all %xmm registers are clobbered
-#
-   sub $VARIABLE_OFFSET, %rsp
-   and $~63, %rsp
+   FUNC_SAVE
mov %arg6, %r12
movdqu  (%r12), %xmm13
 movdqa  SHUF_MASK(%rip), %xmm2
PSHUFB_XMM %xmm2, %xmm13
 
-
 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 
movdqa  %xmm13, %xmm2
@@ -1576,10 +1574,7 @@ _T_1_encrypt:
 _T_16_encrypt:
movdqu  %xmm0, (%r10)
 _return_T_done_encrypt:
-   mov %r14, %rsp
-   pop %r14
-   pop %r13
-   pop %r12
+   FUNC_RESTORE
ret
 ENDPROC(aesni_gcm_enc)
 
-- 
2.9.5



[PATCH 01/14] x86/crypto: aesni: Merge INITIAL_BLOCKS_ENC/DEC

2018-02-12 Thread Dave Watson
Use macro operations to merge implementations of INITIAL_BLOCKS,
since they differ by only a small handful of lines.

Use macro counter \@ to simplify implementation.

Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 arch/x86/crypto/aesni-intel_asm.S | 298 ++
 1 file changed, 48 insertions(+), 250 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 76d8cd4..48911fe 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -275,234 +275,7 @@ _done_read_partial_block_\@:
 */
 
 
-.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 
XMM1 \
-XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
-MOVADQ SHUF_MASK(%rip), %xmm14
-   movarg7, %r10   # %r10 = AAD
-   movarg8, %r11   # %r11 = aadLen
-   pxor   %xmm\i, %xmm\i
-   pxor   \XMM2, \XMM2
-
-   cmp$16, %r11
-   jl _get_AAD_rest\num_initial_blocks\operation
-_get_AAD_blocks\num_initial_blocks\operation:
-   movdqu (%r10), %xmm\i
-   PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
-   pxor   %xmm\i, \XMM2
-   GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-   add$16, %r10
-   sub$16, %r11
-   cmp$16, %r11
-   jge_get_AAD_blocks\num_initial_blocks\operation
-
-   movdqu \XMM2, %xmm\i
-
-   /* read the last <16B of AAD */
-_get_AAD_rest\num_initial_blocks\operation:
-   cmp$0, %r11
-   je _get_AAD_done\num_initial_blocks\operation
-
-   READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
-   PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
-   pxor   \XMM2, %xmm\i
-   GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-
-_get_AAD_done\num_initial_blocks\operation:
-   xor%r11, %r11 # initialise the data pointer offset as zero
-   # start AES for num_initial_blocks blocks
-
-   mov%arg5, %rax  # %rax = *Y0
-   movdqu (%rax), \XMM0# XMM0 = Y0
-   PSHUFB_XMM   %xmm14, \XMM0
-
-.if (\i == 5) || (\i == 6) || (\i == 7)
-   MOVADQ  ONE(%RIP),\TMP1
-   MOVADQ  (%arg1),\TMP2
-.irpc index, \i_seq
-   paddd  \TMP1, \XMM0 # INCR Y0
-   movdqa \XMM0, %xmm\index
-   PSHUFB_XMM   %xmm14, %xmm\index  # perform a 16 byte swap
-   pxor   \TMP2, %xmm\index
-.endr
-   lea 0x10(%arg1),%r10
-   mov keysize,%eax
-   shr $2,%eax # 128->4, 192->6, 256->8
-   add $5,%eax   # 128->9, 192->11, 256->13
-
-aes_loop_initial_dec\num_initial_blocks:
-   MOVADQ  (%r10),\TMP1
-.irpc  index, \i_seq
-   AESENC  \TMP1, %xmm\index
-.endr
-   add $16,%r10
-   sub $1,%eax
-   jnz aes_loop_initial_dec\num_initial_blocks
-
-   MOVADQ  (%r10), \TMP1
-.irpc index, \i_seq
-   AESENCLAST \TMP1, %xmm\index # Last Round
-.endr
-.irpc index, \i_seq
-   movdqu (%arg3 , %r11, 1), \TMP1
-   pxor   \TMP1, %xmm\index
-   movdqu %xmm\index, (%arg2 , %r11, 1)
-   # write back plaintext/ciphertext for num_initial_blocks
-   add$16, %r11
-
-   movdqa \TMP1, %xmm\index
-   PSHUFB_XMM %xmm14, %xmm\index
-# prepare plaintext/ciphertext for GHASH computation
-.endr
-.endif
-
-# apply GHASH on num_initial_blocks blocks
-
-.if \i == 5
-pxor   %xmm5, %xmm6
-   GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-pxor   %xmm6, %xmm7
-   GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-pxor   %xmm7, %xmm8
-   GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 6
-pxor   %xmm6, %xmm7
-   GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-pxor   %xmm7, %xmm8
-   GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 7
-pxor   %xmm7, %xmm8
-   GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.endif
-   cmp$64, %r13
-   jl  _initial_blocks_done\num_initial_blocks\operation
-   # no need for precomputed values
-/*
-*
-* Precomputations for HashKey parallel with encryption of first 4 blocks.
-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-*/
-   MOVADQ ONE(%rip), \TMP1
-   paddd  \TMP1, \XMM0  # INCR Y0
-   MOVADQ \XMM0, \XMM1
-   PSHUFB_XMM  %xmm14, \XMM1# perform a 16 byte swap
-
-   paddd  \TMP1, \XMM0  # INCR Y0
-   MOVADQ \XMM0, \XMM2
-   PSHUFB_XMM  %xmm14, \XMM2# perform a 16 byte swap
-
-   paddd  \TMP1, \XMM0  # INCR Y0
-   

Re: [PATCHv2] tls: Add support for encryption using async offload accelerator

2018-01-31 Thread Dave Watson
On 01/31/18 05:22 PM, Vakul Garg wrote:
> > > On second though in stable we should probably just disable async tfm
> > > allocations.
> > > It's simpler. But this approach is still good for -next
> > >
> > >
> > > Gilad
> > 
> > I agree with Gilad, just disable async for now.
> > 
> 
> How to do it? Can you help with the api name?

*aead = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);

https://github.com/ktls/net_next_ktls/commit/f3b9b402e755e4b0623fa83f88137173fc249f2d
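
i.e. roughly the following (a sketch; the field name is assumed from
the tls_sw context, and the error handling mirrors what
tls_set_sw_offload would need to do):

/* request a synchronous-only gcm(aes) transform by putting
 * CRYPTO_ALG_ASYNC in the mask with the type bits clear
 */
static int tls_alloc_sync_aead(struct tls_sw_context *sw_ctx)
{
        sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
        if (IS_ERR(sw_ctx->aead_send)) {
                int rc = PTR_ERR(sw_ctx->aead_send);

                sw_ctx->aead_send = NULL;
                return rc;
        }
        return 0;
}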

> > If the flag MSG_DONTWAIT is set, we should be returning -EINPROGRESS and
> > not wait for a response.  I had started working on a patch for that, but 
> > it's
> > pretty tricky to get right.
> 
> Can you point me to your WIP code branch for this?

https://github.com/ktls/net_next_ktls/commit/9cc839aa551ed972d148ecebf353b25ee93543b9

> If MSG_DONTWAIT is not used, will it be sane if enqueue the crypto request to 
> accelerator and return to user space back so that user space can send more 
> plaintext data while 
> crypto accelerator is working in parallel?

Right, that's roughly what the above does.  I believe the tricky
unfinished part was getting poll() to work correctly if there is an
async crypto request outstanding.  Currently the tls poll() just
relies on backpressure from do_tcp_sendpages.


Re: [RFC crypto v3 8/9] chtls: Register the ULP

2018-01-31 Thread Dave Watson
On 01/31/18 04:14 PM, Atul Gupta wrote:
> 
> 
> On Tuesday 30 January 2018 10:41 PM, Dave Watson wrote:
> > On 01/30/18 06:51 AM, Atul Gupta wrote:
> > 
> > > What I was referring is that passing "tls" ulp type in setsockopt
> > > may be insufficient to make the decision when multi HW assist Inline
> > > TLS solution exists.
> > Setting the ULP doesn't choose HW or SW implementation, I think that
> > should be done later when setting up crypto with
> > 
> > setsockopt(SOL_TLS, TLS_TX, struct crypto_info).
> setsockpot [mentioned above] is quite late for driver to enable HW
> implementation, we require something as early as tls_init [setsockopt(sock,
> SOL_TCP, TCP_ULP, "tls", sizeof("tls"))], for driver to set HW prot and
> offload connection beside Inline Tx/Rx.
> > 
> > Any reason we can't use ethtool to choose HW vs SW implementation, if
> > available on the device?
> Thought about it,  the interface index is not available to fetch netdev and
> caps check to set HW prot eg. bind [prot.hash] --> tls_hash to program HW.

Perhaps this is the part I don't follow - why do you need to override
hash and check for LISTEN?  I briefly looked through the patch named
"CPL handler definition", this looks like it is a full TCP offload?


Re: [PATCHv2] tls: Add support for encryption using async offload accelerator

2018-01-31 Thread Dave Watson
On 01/31/18 09:34 PM, Vakul Garg wrote:
> Async crypto accelerators (e.g. drivers/crypto/caam) support offloading
> GCM operation. If they are enabled, crypto_aead_encrypt() return error
> code -EINPROGRESS. In this case tls_do_encryption() needs to wait on a
> completion till the time the response for crypto offload request is
> received.

Comments from V1
> On Wed, Jan 31, 2018 at 8:10 AM, Gilad Ben-Yossef  wrote:
>> Hi Vakul,
>>
>> On Wed, Jan 31, 2018 at 12:36 PM, Vakul Garg  wrote:
>>> Async crypto accelerators (e.g. drivers/crypto/caam) support offloading
>>> GCM operation. If they are enabled, crypto_aead_encrypt() return error
>>> code -EINPROGRESS. In this case tls_do_encryption() needs to wait on a
>>> completion till the time the response for crypto offload request is
>>> received.
>>>
>>
>> Thank you for this patch. I think it is actually a bug fix and should
>> probably go into stable
>
> On second thought in stable we should probably just disable async tfm
> allocations.
> It's simpler. But this approach is still good for -next
>
>
> Gilad

I agree with Gilad, just disable async for now. 

If the flag MSG_DONTWAIT is set, we should be returning -EINPROGRESS
and not wait for a response.  I had started working on a patch for
that, but it's pretty tricky to get right.


Re: [RFC crypto v3 8/9] chtls: Register the ULP

2018-01-30 Thread Dave Watson
On 01/30/18 06:51 AM, Atul Gupta wrote:

> What I was referring is that passing "tls" ulp type in setsockopt
> may be insufficient to make the decision when multi HW assist Inline
> TLS solution exists.

Setting the ULP doesn't choose HW or SW implementation, I think that
should be done later when setting up crypto with 

setsockopt(SOL_TLS, TLS_TX, struct crypto_info).
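
For reference, a minimal userspace sketch of that two-step setup; the
helper name and how the key material reaches it are illustrative, and the
handshake itself stays in userspace:

  #include <string.h>
  #include <sys/socket.h>
  #include <netinet/tcp.h>
  #include <linux/tls.h>

  #ifndef TCP_ULP
  #define TCP_ULP 31              /* from the ULP infrastructure patch */
  #endif
  #ifndef SOL_TLS
  #define SOL_TLS 282             /* from include/linux/socket.h */
  #endif

  static int enable_ktls_tx(int sock, const unsigned char *key,
                            const unsigned char *iv, const unsigned char *salt,
                            const unsigned char *rec_seq)
  {
          struct tls12_crypto_info_aes_gcm_128 ci;

          /* 1. attach the generic "tls" ULP to the established TCP socket */
          if (setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls")))
                  return -1;

          /* 2. hand the negotiated AES-GCM-128 material to the kernel */
          memset(&ci, 0, sizeof(ci));
          ci.info.version = TLS_1_2_VERSION;
          ci.info.cipher_type = TLS_CIPHER_AES_GCM_128;
          memcpy(ci.key, key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
          memcpy(ci.iv, iv, TLS_CIPHER_AES_GCM_128_IV_SIZE);
          memcpy(ci.salt, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
          memcpy(ci.rec_seq, rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);

          return setsockopt(sock, SOL_TLS, TLS_TX, &ci, sizeof(ci));
  }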

Any reason we can't use ethtool to choose HW vs SW implementation, if
available on the device?

> Some HW may go beyond defining sendmsg/sendpage of the prot and
> require additional info to setup the env? Also, we need to keep
> vendor specific code out of tls_main.c i.e anything other than
> base/sw_tx prot perhaps go to hw driver.

Sure, but I think we can add hooks to tls_main to do this without a
new ULP.


Re: [RFC crypto v3 8/9] chtls: Register the ULP

2018-01-25 Thread Dave Watson
<1513769897-26945-1-git-send-email-atul.gu...@chelsio.com>

On 12/20/17 05:08 PM, Atul Gupta wrote:
> +static void __init chtls_init_ulp_ops(void)
> +{
> + chtls_base_prot = tcp_prot;
> + chtls_base_prot.hash= chtls_hash;
> + chtls_base_prot.unhash  = chtls_unhash;
> + chtls_base_prot.close   = chtls_lsk_close;
> +
> + chtls_cpl_prot  = chtls_base_prot;
> + chtls_init_rsk_ops(&chtls_cpl_prot, &chtls_rsk_ops,
> +    &tcp_prot, PF_INET);
> + chtls_cpl_prot.close= chtls_close;
> + chtls_cpl_prot.disconnect   = chtls_disconnect;
> + chtls_cpl_prot.destroy  = chtls_destroy_sock;
> + chtls_cpl_prot.shutdown = chtls_shutdown;
> + chtls_cpl_prot.sendmsg  = chtls_sendmsg;
> + chtls_cpl_prot.recvmsg  = chtls_recvmsg;
> + chtls_cpl_prot.sendpage = chtls_sendpage;
> + chtls_cpl_prot.setsockopt   = chtls_setsockopt;
> + chtls_cpl_prot.getsockopt   = chtls_getsockopt;
> +}

Much of this file should go in tls_main.c, reusing as much as
possible. For example it doesn't look like the get/set sockopts have
changed at all for chtls.

> +
> +static int __init chtls_register(void)
> +{
> + chtls_init_ulp_ops();
> + register_listen_notifier(&listen_notifier);
> + cxgb4_register_uld(CXGB4_ULD_TLS, &chtls_uld_info);
> + tcp_register_ulp(&tcp_chtls_ulp_ops);
> + return 0;
> +}
> +
> +static void __exit chtls_unregister(void)
> +{
> + unregister_listen_notifier(&listen_notifier);
> + tcp_unregister_ulp(&tcp_chtls_ulp_ops);
> + chtls_free_all_uld();
> + cxgb4_unregister_uld(CXGB4_ULD_TLS);
> +}

The idea with ULP is that there is one ULP hook per protocol, 
not per driver.  
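
In other words, net/tls registers the single "tls" ULP and a hardware
driver plugs in behind it.  A rough sketch following the tcp_ulp_ops
layout from the ULP infrastructure patch (tls_init stands in for the
module's init hook):

  #include <linux/module.h>
  #include <net/tcp.h>

  static int tls_init(struct sock *sk);           /* module's ULP init hook */

  static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
          .name  = "tls",
          .owner = THIS_MODULE,
          .init  = tls_init,
  };

  static int __init tls_register(void)
  {
          /* one registration per protocol; the HW/SW choice happens later */
          return tcp_register_ulp(&tcp_tls_ulp_ops);
  }

  static void __exit tls_unregister(void)
  {
          tcp_unregister_ulp(&tcp_tls_ulp_ops);
  }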


Re: [PATCH v3 net-next 1/4] tcp: ULP infrastructure

2017-07-31 Thread Dave Watson
On 07/29/17 01:12 PM, Tom Herbert wrote:
> On Wed, Jun 14, 2017 at 11:37 AM, Dave Watson <davejwat...@fb.com> wrote:
> > Add the infrustructure for attaching Upper Layer Protocols (ULPs) over TCP
> > sockets. Based on a similar infrastructure in tcp_cong.  The idea is that 
> > any
> > ULP can add its own logic by changing the TCP proto_ops structure to its own
> > methods.
> >
> > Example usage:
> >
> > setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
> >
> One question: is there a good reason why the ULP infrastructure should
> just be for TCP sockets. For example, I'd really like to be able
> something like:
> 
> setsockopt(sock, SOL_SOCKET, SO_ULP, &ulp_param, sizeof(ulp_param));
> 
> Where ulp_param is a structure containing the ULP name as well as some
> ULP specific parameters that are passed to init_ulp. ulp_init could
> determine whether the socket family is appropriate for the ULP being
> requested.

Using SOL_SOCKET instead seems reasonable to me.  I can see how
ulp_params could have some use, perhaps at a slight loss in clarity.
TLS needs its own setsockopts anyway though, for renegotiation for
example.
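
To make the comparison concrete, a purely hypothetical sketch of the
proposed SOL_SOCKET interface; SO_ULP and struct ulp_param do not exist
upstream, and every name and value below is invented for illustration:

  #include <string.h>
  #include <sys/socket.h>

  #define SO_ULP 100                      /* invented option number */

  struct ulp_param {
          char         ulp_name[16];      /* ULP name, e.g. "tls" */
          unsigned int ulp_flags;         /* ULP-specific parameters */
  };

  static int attach_ulp(int sock, const char *name)
  {
          struct ulp_param p;

          memset(&p, 0, sizeof(p));
          strncpy(p.ulp_name, name, sizeof(p.ulp_name) - 1);
          /* the kernel's ulp init would validate the socket family here */
          return setsockopt(sock, SOL_SOCKET, SO_ULP, &p, sizeof(p));
  }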


Re: [PATCH v3 net-next 3/4] tls: kernel TLS support

2017-07-12 Thread Dave Watson
On 07/12/17 09:20 AM, Steffen Klassert wrote:
> On Tue, Jul 11, 2017 at 11:53:11AM -0700, Dave Watson wrote:
> > On 07/11/17 08:29 AM, Steffen Klassert wrote:
> > > Sorry for replying to old mail...
> > > > +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
> > > > +{
> > > 
> > > ...
> > > 
> > > > +
> > > > +   if (!sw_ctx->aead_send) {
> > > > +   sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0);
> > > > +   if (IS_ERR(sw_ctx->aead_send)) {
> > > > +   rc = PTR_ERR(sw_ctx->aead_send);
> > > > +   sw_ctx->aead_send = NULL;
> > > > +   goto free_rec_seq;
> > > > +   }
> > > > +   }
> > > > +
> > > 
> > > When I look on how you allocate the aead transformation, it seems
> > > that you should either register an asynchronous callback with
> > > aead_request_set_callback(), or request for a synchronous algorithm.
> > > 
> > > Otherwise you will crash on an asynchronous crypto return, no?
> > 
> > The intention is for it to be synchronous, and gather directly from
> > userspace buffers.  It looks like calling
> > crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC) is the correct way
> > to request synchronous algorithms only?
> 
> Yes, but then you loose the aes-ni based algorithms because they are
> asynchronous. If you want to have good crypto performance, it is
> better to implement the asynchronous callbacks.

Right, the trick is we want both aesni, and to guarantee that we are
done using the input buffers before sendmsg() returns.  For now I can
set a callback, and wait on a completion.  The initial use case of
userspace openssl integration shouldn't hit the aesni async case
anyway (!irq_fpu_usable()).
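
A minimal sketch of that callback-plus-completion approach, using the
generic crypto_wait helpers; the function name is made up and the request
setup is omitted:

  #include <crypto/aead.h>
  #include <linux/crypto.h>

  /* Sketch: submit the AEAD request and, if the tfm is asynchronous,
   * block until its completion callback fires.  crypto_wait_req() turns
   * -EINPROGRESS/-EBUSY into a wait on the DECLARE_CRYPTO_WAIT object. */
  static int tls_do_encryption_wait(struct aead_request *req)
  {
          DECLARE_CRYPTO_WAIT(wait);

          aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
                                    crypto_req_done, &wait);
          return crypto_wait_req(crypto_aead_encrypt(req), &wait);
  }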


Re: [PATCH v3 net-next 3/4] tls: kernel TLS support

2017-07-11 Thread Dave Watson
On 07/11/17 08:29 AM, Steffen Klassert wrote:
> Sorry for replying to old mail...
> > +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
> > +{
> 
> ...
> 
> > +
> > +   if (!sw_ctx->aead_send) {
> > +   sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0);
> > +   if (IS_ERR(sw_ctx->aead_send)) {
> > +   rc = PTR_ERR(sw_ctx->aead_send);
> > +   sw_ctx->aead_send = NULL;
> > +   goto free_rec_seq;
> > +   }
> > +   }
> > +
> 
> When I look on how you allocate the aead transformation, it seems
> that you should either register an asynchronous callback with
> aead_request_set_callback(), or request for a synchronous algorithm.
> 
> Otherwise you will crash on an asynchronous crypto return, no?

The intention is for it to be synchronous, and gather directly from
userspace buffers.  It looks like calling
crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC) is the correct way
to request synchronous algorithms only?

> Also, it seems that you have your scatterlists on a per crypto
> transformation base istead of per crypto request. Is this intentional?

We hold the socket lock and only one crypto op can happen at a time,
so we reuse the scatterlists.
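
A sketch of the invariant being relied on; tls_push_record_stub() is just
a placeholder for the real record path:

  #include <net/sock.h>

  static int tls_push_record_stub(struct sock *sk, struct msghdr *msg,
                                  size_t len);    /* placeholder */

  /* Sketch: each record is built and encrypted with the socket lock held,
   * so the scatterlists embedded in the per-socket context are never in
   * use by two operations at once. */
  static int tls_send_locked(struct sock *sk, struct msghdr *msg, size_t len)
  {
          int rc;

          lock_sock(sk);
          rc = tls_push_record_stub(sk, msg, len);  /* fills and uses ctx->sg_* */
          release_sock(sk);
          return rc;
  }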


Re: [PATCH v3 net-next 0/4] kernel TLS

2017-07-06 Thread Dave Watson
Hi Richard, 

On 07/06/17 04:30 PM, Richard Weinberger wrote:
> Dave,
> 
> On Wed, Jun 14, 2017 at 8:36 PM, Dave Watson <davejwat...@fb.com> wrote:
> >  Documentation/networking/tls.txt   | 135 +++
> >  MAINTAINERS|  10 +
> >  include/linux/socket.h |   1 +
> >  include/net/inet_connection_sock.h |   4 +
> >  include/net/tcp.h  |  27 ++
> >  include/net/tls.h  | 237 
> >  include/uapi/linux/tcp.h   |   1 +
> >  include/uapi/linux/tls.h   |  79 
> >  net/Kconfig|   1 +
> >  net/Makefile   |   1 +
> >  net/ipv4/Makefile  |   2 +-
> >  net/ipv4/sysctl_net_ipv4.c |  25 ++
> >  net/ipv4/tcp.c |  33 +-
> >  net/ipv4/tcp_ipv4.c|   2 +
> >  net/ipv4/tcp_rate.c|   1 +
> >  net/ipv4/tcp_ulp.c | 134 +++
> >  net/tls/Kconfig|  12 +
> >  net/tls/Makefile   |   7 +
> >  net/tls/tls_main.c | 487 +++
> >  net/tls/tls_sw.c   | 772 
> > +
> >  20 files changed, 1968 insertions(+), 3 deletions(-)
> >  create mode 100644 Documentation/networking/tls.txt
> >  create mode 100644 include/net/tls.h
> >  create mode 100644 include/uapi/linux/tls.h
> >  create mode 100644 net/ipv4/tcp_ulp.c
> >  create mode 100644 net/tls/Kconfig
> >  create mode 100644 net/tls/Makefile
> >  create mode 100644 net/tls/tls_main.c
> >  create mode 100644 net/tls/tls_sw.c
> 
> Sorry for the late question. Do I miss something or is this IPv4 only?

The hooks it currently overrides / uses from proto_ops (sendmsg, sendpage,
get/setsockopt, close) are the same for ipv4 & ipv6, so it should work
for both.  Our test suites have been passing in both, at least.


Re: [PATCH v3 net-next 1/4] tcp: ULP infrastructure

2017-06-26 Thread Dave Watson
On 06/25/17 02:42 AM, Levin, Alexander (Sasha Levin) wrote:
> On Wed, Jun 14, 2017 at 11:37:14AM -0700, Dave Watson wrote:
> >Add the infrustructure for attaching Upper Layer Protocols (ULPs) over TCP
> >sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
> >ULP can add its own logic by changing the TCP proto_ops structure to its own
> >methods.
> >
> >Example usage:
> >
> >setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
> >
> >modules will call:
> >tcp_register_ulp(_tls_ulp_ops);
> >
> >to register/unregister their ulp, with an init function and name.
> >
> >A list of registered ulps will be returned by tcp_get_available_ulp, which is
> >hooked up to /proc.  Example:
> >
> >$ cat /proc/sys/net/ipv4/tcp_available_ulp
> >tls
> >
> >There is currently no functionality to remove or chain ULPs, but
> >it should be possible to add these in the future if needed.
> >
> >Signed-off-by: Boris Pismenny <bor...@mellanox.com>
> >Signed-off-by: Dave Watson <davejwat...@fb.com>
> 
> Hey Dave,
> 
> I'm seeing the following while fuzzing, which was bisected to this commit:
> 
> ==
> BUG: KASAN: null-ptr-deref in copy_to_user include/linux/uaccess.h:168 
> [inline]
> BUG: KASAN: null-ptr-deref in do_tcp_getsockopt.isra.33+0x24f/0x1e30 
> net/ipv4/tcp.c:3057
> Read of size 4 at addr 0020 by task syz-executor1/15452

At a glance, this looks like it was fixed already by

https://www.mail-archive.com/netdev@vger.kernel.org/msg175226.html

Can you recheck with that patch, or verify that you already have it?
Thanks.


Re: [PATCH v3 net-next 3/4] tls: kernel TLS support

2017-06-16 Thread Dave Watson
On 06/16/17 01:58 PM, Stephen Hemminger wrote:
> On Wed, 14 Jun 2017 11:37:39 -0700
> Dave Watson <davejwat...@fb.com> wrote:
> 
> > --- /dev/null
> > +++ b/net/tls/Kconfig
> > @@ -0,0 +1,12 @@
> > +#
> > +# TLS configuration
> > +#
> > +config TLS
> > +   tristate "Transport Layer Security support"
> > +   depends on NET
> > +   default m
> > +   ---help---
> > +   Enable kernel support for TLS protocol. This allows symmetric
> > +   encryption handling of the TLS protocol to be done in-kernel.
> > +
> > +   If unsure, say M.
> 
> I understand that this will be useful to lots of people and most distributions
> will enable it. But the defacto policy in kernel configuration has been that
> new features in kernel default to being disabled.

Sure, will send a patch to switch to default n.


Re: [PATCH v3 net-next 0/4] kernel TLS

2017-06-14 Thread Dave Watson
On 06/14/17 01:54 PM, Tom Herbert wrote:
> On Wed, Jun 14, 2017 at 11:36 AM, Dave Watson <davejwat...@fb.com> wrote:
> > This series adds support for kernel TLS encryption over TCP sockets.
> > A standard TCP socket is converted to a TLS socket using a setsockopt.
> > Only symmetric crypto is done in the kernel, as well as TLS record
> > framing.  The handshake remains in userspace, and the negotiated
> > cipher keys/iv are provided to the TCP socket.
> >
> I don't see support for TLS receive path in the kernel, only the send
> path. Am I missing something?

Correct, this is only TX.  Since it sounds likely some hardware might
only be able to offload TX, we decided to configure TX and RX
separately.  Using the OpenSSL patches, it should be transparent to
users even if only one side is offloaded.

The software RX patches exist but haven't been polished up yet.


Re: [PATCH v3 net-next 0/4] kernel TLS

2017-06-14 Thread Dave Watson
Hi Hannes, 

On 06/14/17 10:15 PM, Hannes Frederic Sowa wrote:
> one question for this patch set:
> 
> What is the reason for not allowing key updates for the TX path? I was
> always loud pointing out the problems with TLSv1.2 renegotiation and
> TLSv1.3 key update alerts. This patch set uses encryption in a
> synchronous way directly in the socket layer and thus wouldn't suffer
> from problems regarding updates of the key. My hunch is that you leave
> this option open so you can later on introduce asynchronous crypto which
> might be used on hardware? It looks also be doable in case of MSG_MORE.
> Otherwise by allowing key updates to the data path I would not see any
> problems with key updates in TLS.

I don't currently have any reasons to not support renegotation, we
just don't currently use it, so I didn't add support for it.  I don't
work on the hardware, but yes it looks like it would have to keep the
old keys around until everything sent using them has been acked.

> Anyway, this patch seems easy and maybe with key updates added later on
> doesn't seem to have any problems pointed out by me so far.

Indeed, it would be easy to flush any unencrypted data, and then
change the keys.


[PATCH v3 net-next 3/4] tls: kernel TLS support

2017-06-14 Thread Dave Watson
Software implementation of transport layer security, implemented using ULP
infrastructure.  tcp proto_ops are replaced with tls equivalents of sendmsg and
sendpage.

Only symmetric crypto is done in the kernel, keys are passed by setsockopt
after the handshake is complete.  All control messages are supported via CMSG
data - the actual symmetric encryption is the same, just the message type needs
to be passed separately.

For user API, please see Documentation patch.

Pieces that can be shared between hw and sw implementation
are in tls_main.c

Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Ilya Lesokhin <il...@mellanox.com>
Signed-off-by: Aviad Yehezkel <avia...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 MAINTAINERS  |  10 +
 include/linux/socket.h   |   1 +
 include/net/tls.h| 237 +++
 include/uapi/linux/tls.h |  79 +
 net/Kconfig  |   1 +
 net/Makefile |   1 +
 net/tls/Kconfig  |  12 +
 net/tls/Makefile |   7 +
 net/tls/tls_main.c   | 487 ++
 net/tls/tls_sw.c | 772 +++
 10 files changed, 1607 insertions(+)
 create mode 100644 include/net/tls.h
 create mode 100644 include/uapi/linux/tls.h
 create mode 100644 net/tls/Kconfig
 create mode 100644 net/tls/Makefile
 create mode 100644 net/tls/tls_main.c
 create mode 100644 net/tls/tls_sw.c

diff --git a/MAINTAINERS b/MAINTAINERS
index f4e682c..710af53 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8979,6 +8979,16 @@ F:   net/ipv6/
 F: include/net/ip*
 F: arch/x86/net/*
 
+NETWORKING [TLS]
+M: Ilya Lesokhin <il...@mellanox.com>
+M: Aviad Yehezkel <avia...@mellanox.com>
+M: Dave Watson <davejwat...@fb.com>
+L: net...@vger.kernel.org
+S: Maintained
+F: net/tls/*
+F: include/uapi/linux/tls.h
+F: include/net/tls.h
+
 NETWORKING [IPSEC]
 M: Steffen Klassert <steffen.klass...@secunet.com>
 M: Herbert Xu <herb...@gondor.apana.org.au>
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 0820274..8b13db5 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -334,6 +334,7 @@ struct ucred {
 #define SOL_ALG	279
 #define SOL_NFC	280
 #define SOL_KCM	281
+#define SOL_TLS	282
 
 /* IPX options */
 #define IPX_TYPE   1
diff --git a/include/net/tls.h b/include/net/tls.h
new file mode 100644
index 000..b89d397
--- /dev/null
+++ b/include/net/tls.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwat...@fb.com>. All rights 
reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _TLS_OFFLOAD_H
+#define _TLS_OFFLOAD_H
+
+#include 
+
+#include 
+
+
+/* Maximum data size carried in a TLS record */
+#define TLS_MAX_PAYLOAD_SIZE   ((size_t)1 << 14)
+
+#define TLS_HEADER_SIZE	5
+#define TLS_NONCE_OFFSET   TLS_HEADER_SIZE
+
+#define TLS_CRYPTO_INFO_READY(info)	((info)->cipher_type)
+
+#define TLS_RECORD_TYPE_DATA   0x17
+
+#define TLS_AAD_SPACE_SIZE 13
+
+struct tls_sw_context {
+   struct crypto_aead *aead_send;
+
+   /* Sending context */
+   char aad_space[TLS_AAD_SPACE_SIZE];
+
+   unsigned int sg_plaintext_size;
+   int sg_plaintext_num_elem;
+   struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS];
+
+   unsigned int sg_encrypted_size;

[PATCH v3 net-next 4/4] tls: Documentation

2017-06-14 Thread Dave Watson
Add documentation for the tcp ULP tls interface.

Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 Documentation/networking/tls.txt | 135 +++
 1 file changed, 135 insertions(+)
 create mode 100644 Documentation/networking/tls.txt

diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.txt
new file mode 100644
index 000..77ed006
--- /dev/null
+++ b/Documentation/networking/tls.txt
@@ -0,0 +1,135 @@
+Overview
+
+
+Transport Layer Security (TLS) is an Upper Layer Protocol (ULP) that runs over
+TCP. TLS provides end-to-end data integrity and confidentiality.
+
+User interface
+==
+
+Creating a TLS connection
+-
+
+First create a new TCP socket and set the TLS ULP.
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
+
+Setting the TLS ULP allows us to set/get TLS socket options. Currently
+only the symmetric encryption is handled in the kernel.  After the TLS
+handshake is complete, we have all the parameters required to move the
+data-path to the kernel. There is a separate socket option for moving
+the transmit and the receive into the kernel.
+
+  /* From linux/tls.h */
+  struct tls_crypto_info {
+  unsigned short version;
+  unsigned short cipher_type;
+  };
+
+  struct tls12_crypto_info_aes_gcm_128 {
+  struct tls_crypto_info info;
+  unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE];
+  unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
+  unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE];
+  unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE];
+  };
+
+
+  struct tls12_crypto_info_aes_gcm_128 crypto_info;
+
+  crypto_info.info.version = TLS_1_2_VERSION;
+  crypto_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
+  memcpy(crypto_info.iv, iv_write, TLS_CIPHER_AES_GCM_128_IV_SIZE);
+  memcpy(crypto_info.rec_seq, seq_number_write,
+   TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
+  memcpy(crypto_info.key, cipher_key_write, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+  memcpy(crypto_info.salt, implicit_iv_write, 
TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+
+  setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
+
+Sending TLS application data
+
+
+After setting the TLS_TX socket option all application data sent over this
+socket is encrypted using TLS and the parameters provided in the socket option.
+For example, we can send an encrypted hello world record as follows:
+
+  const char *msg = "hello world\n";
+  send(sock, msg, strlen(msg));
+
+send() data is directly encrypted from the userspace buffer provided
+to the encrypted kernel send buffer if possible.
+
+The sendfile system call will send the file's data over TLS records of maximum
+length (2^14).
+
+  file = open(filename, O_RDONLY);
+  fstat(file, &stat);
+  sendfile(sock, file, &offset, stat.st_size);
+
+TLS records are created and sent after each send() call, unless
+MSG_MORE is passed.  MSG_MORE will delay creation of a record until
+MSG_MORE is not passed, or the maximum record size is reached.
+
+The kernel will need to allocate a buffer for the encrypted data.
+This buffer is allocated at the time send() is called, such that
+either the entire send() call will return -ENOMEM (or block waiting
+for memory), or the encryption will always succeed.  If send() returns
+-ENOMEM and some data was left on the socket buffer from a previous
+call using MSG_MORE, the MSG_MORE data is left on the socket buffer.
+
+Send TLS control messages
+-
+
+Other than application data, TLS has control messages such as alert
+messages (record type 21) and handshake messages (record type 22), etc.
+These messages can be sent over the socket by providing the TLS record type
+via a CMSG. For example the following function sends @data of @length bytes
+using a record of type @record_type.
+
+/* send TLS control message using record_type */
+  static int klts_send_ctrl_message(int sock, unsigned char record_type,
+  void *data, size_t length)
+  {
+struct msghdr msg = {0};
+int cmsg_len = sizeof(record_type);
+struct cmsghdr *cmsg;
+char buf[CMSG_SPACE(cmsg_len)];
+struct iovec msg_iov;   /* Vector of data to send/receive into.  */
+
+msg.msg_control = buf;
+msg.msg_controllen = sizeof(buf);
+cmsg = CMSG_FIRSTHDR(&msg);
+cmsg->cmsg_level = SOL_TLS;
+cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
+cmsg->cmsg_len = CMSG_LEN(cmsg_len);
+*CMSG_DATA(cmsg) = record_type;
+msg.msg_controllen = cmsg->cmsg_len;
+
+msg_iov.iov_base = data;
+msg_iov.iov_len = length;
+msg.msg_iov = &msg_iov;
+msg.msg_iovlen = 1;
+
+return s

[PATCH v3 net-next 2/4] tcp: export do_tcp_sendpages and tcp_rate_check_app_limited functions

2017-06-14 Thread Dave Watson
Export do_tcp_sendpages and tcp_rate_check_app_limited, since tls will need to
sendpages while the socket is already locked.

tcp_sendpage is exported, but requires the socket lock to not be held already.

Signed-off-by: Aviad Yehezkel <avia...@mellanox.com>
Signed-off-by: Ilya Lesokhin <il...@mellanox.com>
Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tcp.h   | 2 ++
 net/ipv4/tcp.c  | 5 +++--
 net/ipv4/tcp_rate.c | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b439f46..e17ec28 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -350,6 +350,8 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
 int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+size_t size, int flags);
 void tcp_release_cb(struct sock *sk);
 void tcp_wfree(struct sk_buff *skb);
 void tcp_write_timer_handler(struct sock *sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b06ee30..11e4ee2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -901,8 +901,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, 
int flags)
return mss_now;
 }
 
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
-   size_t size, int flags)
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+size_t size, int flags)
 {
struct tcp_sock *tp = tcp_sk(sk);
int mss_now, size_goal;
@@ -1032,6 +1032,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct 
page *page, int offset,
}
return sk_stream_error(sk, flags, err);
 }
+EXPORT_SYMBOL_GPL(do_tcp_sendpages);
 
 int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 size_t size, int flags)
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index ad99569..3330a37 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -185,3 +185,4 @@ void tcp_rate_check_app_limited(struct sock *sk)
tp->app_limited =
(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
 }
+EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
-- 
2.9.3



[PATCH v3 net-next 1/4] tcp: ULP infrastructure

2017-06-14 Thread Dave Watson
Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
ULP can add its own logic by changing the TCP proto_ops structure to its own
methods.

Example usage:

setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));

modules will call:
tcp_register_ulp(&tcp_tls_ulp_ops);

to register/unregister their ulp, with an init function and name.

A list of registered ulps will be returned by tcp_get_available_ulp, which is
hooked up to /proc.  Example:

$ cat /proc/sys/net/ipv4/tcp_available_ulp
tls

There is currently no functionality to remove or chain ULPs, but
it should be possible to add these in the future if needed.

Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/inet_connection_sock.h |   4 ++
 include/net/tcp.h  |  25 +++
 include/uapi/linux/tcp.h   |   1 +
 net/ipv4/Makefile  |   2 +-
 net/ipv4/sysctl_net_ipv4.c |  25 +++
 net/ipv4/tcp.c |  28 
 net/ipv4/tcp_ipv4.c|   2 +
 net/ipv4/tcp_ulp.c | 134 +
 8 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv4/tcp_ulp.c

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index c7a5779..13e4c89 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -75,6 +75,8 @@ struct inet_connection_sock_af_ops {
  * @icsk_pmtu_cookie  Last pmtu seen by socket
  * @icsk_ca_ops   Pluggable congestion control hook
  * @icsk_af_ops   Operations which are AF_INET{4,6} specific
+ * @icsk_ulp_ops  Pluggable ULP control hook
+ * @icsk_ulp_data ULP private data
  * @icsk_ca_state:Congestion control state
  * @icsk_retransmits: Number of unrecovered [RTO] timeouts
  * @icsk_pending: Scheduled timer event
@@ -97,6 +99,8 @@ struct inet_connection_sock {
__u32 icsk_pmtu_cookie;
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
+   const struct tcp_ulp_ops  *icsk_ulp_ops;
+   void  *icsk_ulp_data;
unsigned int  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8  icsk_ca_state:6,
  icsk_ca_setsockopt:1,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3ab677d..b439f46 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1991,4 +1991,29 @@ static inline void tcp_listendrop(const struct sock *sk)
 
 enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 
+/*
+ * Interface for adding Upper Level Protocols over TCP
+ */
+
+#define TCP_ULP_NAME_MAX   16
+#define TCP_ULP_MAX		128
+#define TCP_ULP_BUF_MAX		(TCP_ULP_NAME_MAX*TCP_ULP_MAX)
+
+struct tcp_ulp_ops {
+   struct list_headlist;
+
+   /* initialize ulp */
+   int (*init)(struct sock *sk);
+   /* cleanup ulp */
+   void (*release)(struct sock *sk);
+
+   charname[TCP_ULP_NAME_MAX];
+   struct module   *owner;
+};
+int tcp_register_ulp(struct tcp_ulp_ops *type);
+void tcp_unregister_ulp(struct tcp_ulp_ops *type);
+int tcp_set_ulp(struct sock *sk, const char *name);
+void tcp_get_available_ulp(char *buf, size_t len);
+void tcp_cleanup_ulp(struct sock *sk);
+
 #endif /* _TCP_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 38a2b07..8204dce 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -117,6 +117,7 @@ enum {
 #define TCP_SAVED_SYN  28  /* Get SYN headers recorded for 
connection */
 #define TCP_REPAIR_WINDOW  29  /* Get/set window parameters */
 #define TCP_FASTOPEN_CONNECT   30  /* Attempt FastOpen with connect */
+#define TCP_ULP		31	/* Attach a ULP to a TCP connection */
 
 struct tcp_repair_opt {
__u32   opt_code;
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f83de23..afcb435 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
 inet_timewait_sock.o inet_connection_sock.o \
 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
-tcp_rate.o tcp_recovery.o \
+tcp_rate.o tcp_recovery.o tcp_ulp.o \
 tcp_offload.o datagram.o raw.o udp.o udplite.o \
 udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7065234a..9bf8097 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sys

[PATCH v3 net-next 0/4] kernel TLS

2017-06-14 Thread Dave Watson
This series adds support for kernel TLS encryption over TCP sockets.
A standard TCP socket is converted to a TLS socket using a setsockopt.
Only symmetric crypto is done in the kernel, as well as TLS record
framing.  The handshake remains in userspace, and the negotiated
cipher keys/iv are provided to the TCP socket.

We implemented support for this API in OpenSSL 1.1.0, the code is
available at https://github.com/Mellanox/tls-openssl/tree/master

It should work with any TLS library with similar modifications,
a test tool using gnutls is here: https://github.com/Mellanox/tls-af_ktls_tool

RFC patch to openssl:
https://mta.openssl.org/pipermail/openssl-dev/2017-June/009384.html

Changes from V2:

* EXPORT_SYMBOL_GPL in patch 1
* Ensure cleanup code always called before sk_stream_kill_queues to
  avoid warnings

Changes from V1:

* EXPORT_SYMBOL GPL in patch 2
* Add link to OpenSSL patch & gnutls example in documentation patch.
* sk_write_pending check was rolled into the wait_for_memory path,
  which avoids a special case and fixes a lock imbalance issue.
* Unify flag handling for sendmsg/sendfile

Changes from RFC V2:

* Generic ULP (upper layer protocol) framework instead of TLS specific
  setsockopts
* Dropped Mellanox hardware patches, will come as separate series.
  Framework will work for both.

RFC V2:

http://www.mail-archive.com/netdev@vger.kernel.org/msg160317.html

Changes from RFC V1:

* Socket based on changing TCP proto_ops instead of crypto framework
* Merged code with Mellanox's hardware tls offload
* Zerocopy sendmsg support added - sendpage/sendfile is no longer
  necessary for zerocopy optimization

RFC V1:

http://www.mail-archive.com/netdev@vger.kernel.org/msg88021.html

* Socket based on crypto userspace API framework, required two
  sockets in userspace, one encrypted, one unencrypted.

Paper: https://netdevconf.org/1.2/papers/ktls.pdf

Aviad Yehezkel (1):
  tcp: export do_tcp_sendpages and tcp_rate_check_app_limited functions

Boris Pismenny (2):
  tcp: ULP infrastructure
  tls: Documentation

Ilya Lesokhin (1):
  tls: kernel TLS support

 Documentation/networking/tls.txt   | 135 +++
 MAINTAINERS|  10 +
 include/linux/socket.h |   1 +
 include/net/inet_connection_sock.h |   4 +
 include/net/tcp.h  |  27 ++
 include/net/tls.h  | 237 
 include/uapi/linux/tcp.h   |   1 +
 include/uapi/linux/tls.h   |  79 
 net/Kconfig|   1 +
 net/Makefile   |   1 +
 net/ipv4/Makefile  |   2 +-
 net/ipv4/sysctl_net_ipv4.c |  25 ++
 net/ipv4/tcp.c |  33 +-
 net/ipv4/tcp_ipv4.c|   2 +
 net/ipv4/tcp_rate.c|   1 +
 net/ipv4/tcp_ulp.c | 134 +++
 net/tls/Kconfig|  12 +
 net/tls/Makefile   |   7 +
 net/tls/tls_main.c | 487 +++
 net/tls/tls_sw.c   | 772 +
 20 files changed, 1968 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/networking/tls.txt
 create mode 100644 include/net/tls.h
 create mode 100644 include/uapi/linux/tls.h
 create mode 100644 net/ipv4/tcp_ulp.c
 create mode 100644 net/tls/Kconfig
 create mode 100644 net/tls/Makefile
 create mode 100644 net/tls/tls_main.c
 create mode 100644 net/tls/tls_sw.c

-- 
2.9.3



[PATCH v2 net-next 4/4] tls: Documentation

2017-06-06 Thread Dave Watson
Add documentation for the tcp ULP tls interface.

Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 Documentation/networking/tls.txt | 135 +++
 1 file changed, 135 insertions(+)
 create mode 100644 Documentation/networking/tls.txt

diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.txt
new file mode 100644
index 000..77ed006
--- /dev/null
+++ b/Documentation/networking/tls.txt
@@ -0,0 +1,135 @@
+Overview
+
+
+Transport Layer Security (TLS) is an Upper Layer Protocol (ULP) that runs over
+TCP. TLS provides end-to-end data integrity and confidentiality.
+
+User interface
+==
+
+Creating a TLS connection
+-
+
+First create a new TCP socket and set the TLS ULP.
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
+
+Setting the TLS ULP allows us to set/get TLS socket options. Currently
+only the symmetric encryption is handled in the kernel.  After the TLS
+handshake is complete, we have all the parameters required to move the
+data-path to the kernel. There is a separate socket option for moving
+the transmit and the receive into the kernel.
+
+  /* From linux/tls.h */
+  struct tls_crypto_info {
+  unsigned short version;
+  unsigned short cipher_type;
+  };
+
+  struct tls12_crypto_info_aes_gcm_128 {
+  struct tls_crypto_info info;
+  unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE];
+  unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
+  unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE];
+  unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE];
+  };
+
+
+  struct tls12_crypto_info_aes_gcm_128 crypto_info;
+
+  crypto_info.info.version = TLS_1_2_VERSION;
+  crypto_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
+  memcpy(crypto_info.iv, iv_write, TLS_CIPHER_AES_GCM_128_IV_SIZE);
+  memcpy(crypto_info.rec_seq, seq_number_write,
+   TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
+  memcpy(crypto_info.key, cipher_key_write, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+  memcpy(crypto_info.salt, implicit_iv_write, 
TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+
+  setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
+
+Sending TLS application data
+
+
+After setting the TLS_TX socket option all application data sent over this
+socket is encrypted using TLS and the parameters provided in the socket option.
+For example, we can send an encrypted hello world record as follows:
+
+  const char *msg = "hello world\n";
+  send(sock, msg, strlen(msg));
+
+send() data is directly encrypted from the userspace buffer provided
+to the encrypted kernel send buffer if possible.
+
+The sendfile system call will send the file's data over TLS records of maximum
+length (2^14).
+
+  file = open(filename, O_RDONLY);
+  fstat(file, &stat);
+  sendfile(sock, file, &offset, stat.st_size);
+
+TLS records are created and sent after each send() call, unless
+MSG_MORE is passed.  MSG_MORE will delay creation of a record until
+MSG_MORE is not passed, or the maximum record size is reached.
+
+The kernel will need to allocate a buffer for the encrypted data.
+This buffer is allocated at the time send() is called, such that
+either the entire send() call will return -ENOMEM (or block waiting
+for memory), or the encryption will always succeed.  If send() returns
+-ENOMEM and some data was left on the socket buffer from a previous
+call using MSG_MORE, the MSG_MORE data is left on the socket buffer.
+
+Send TLS control messages
+-
+
+Other than application data, TLS has control messages such as alert
+messages (record type 21) and handshake messages (record type 22), etc.
+These messages can be sent over the socket by providing the TLS record type
+via a CMSG. For example the following function sends @data of @length bytes
+using a record of type @record_type.
+
+/* send TLS control message using record_type */
+  static int klts_send_ctrl_message(int sock, unsigned char record_type,
+  void *data, size_t length)
+  {
+struct msghdr msg = {0};
+int cmsg_len = sizeof(record_type);
+struct cmsghdr *cmsg;
+char buf[CMSG_SPACE(cmsg_len)];
+struct iovec msg_iov;   /* Vector of data to send/receive into.  */
+
+msg.msg_control = buf;
+msg.msg_controllen = sizeof(buf);
+cmsg = CMSG_FIRSTHDR(&msg);
+cmsg->cmsg_level = SOL_TLS;
+cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
+cmsg->cmsg_len = CMSG_LEN(cmsg_len);
+*CMSG_DATA(cmsg) = record_type;
+msg.msg_controllen = cmsg->cmsg_len;
+
+msg_iov.iov_base = data;
+msg_iov.iov_len = length;
+msg.msg_iov = &msg_iov;
+msg.msg_iovlen = 1;
+
+return s

[PATCH v2 net-next 3/4] tls: kernel TLS support

2017-06-06 Thread Dave Watson
Software implementation of transport layer security, implemented using ULP
infrastructure.  tcp proto_ops are replaced with tls equivalents of sendmsg and
sendpage.

Only symmetric crypto is done in the kernel, keys are passed by setsockopt
after the handshake is complete.  All control messages are supported via CMSG
data - the actual symmetric encryption is the same, just the message type needs
to be passed separately.

For user API, please see Documentation patch.

Pieces that can be shared between hw and sw implementation
are in tls_main.c

Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Ilya Lesokhin <il...@mellanox.com>
Signed-off-by: Aviad Yehezkel <avia...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 MAINTAINERS  |  10 +
 include/linux/socket.h   |   1 +
 include/net/tls.h| 222 +
 include/uapi/linux/tls.h |  79 +
 net/Kconfig  |   1 +
 net/Makefile |   1 +
 net/tls/Kconfig  |  12 +
 net/tls/Makefile |   7 +
 net/tls/tls_main.c   | 485 +
 net/tls/tls_sw.c | 794 +++
 10 files changed, 1612 insertions(+)
 create mode 100644 include/net/tls.h
 create mode 100644 include/uapi/linux/tls.h
 create mode 100644 net/tls/Kconfig
 create mode 100644 net/tls/Makefile
 create mode 100644 net/tls/tls_main.c
 create mode 100644 net/tls/tls_sw.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 6b7625f..246ddd7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8973,6 +8973,16 @@ F:   net/ipv6/
 F: include/net/ip*
 F: arch/x86/net/*
 
+NETWORKING [TLS]
+M: Ilya Lesokhin <il...@mellanox.com>
+M: Aviad Yehezkel <avia...@mellanox.com>
+M: Dave Watson <davejwat...@fb.com>
+L: net...@vger.kernel.org
+S: Maintained
+F: net/tls/*
+F: include/uapi/linux/tls.h
+F: include/net/tls.h
+
 NETWORKING [IPSEC]
 M: Steffen Klassert <steffen.klass...@secunet.com>
 M: Herbert Xu <herb...@gondor.apana.org.au>
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 0820274..8b13db5 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -334,6 +334,7 @@ struct ucred {
 #define SOL_ALG	279
 #define SOL_NFC	280
 #define SOL_KCM	281
+#define SOL_TLS	282
 
 /* IPX options */
 #define IPX_TYPE   1
diff --git a/include/net/tls.h b/include/net/tls.h
new file mode 100644
index 000..b20fd2f
--- /dev/null
+++ b/include/net/tls.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwat...@fb.com>. All rights 
reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _TLS_OFFLOAD_H
+#define _TLS_OFFLOAD_H
+
+#include 
+
+#include 
+
+
+/* Maximum data size carried in a TLS record */
+#define TLS_MAX_PAYLOAD_SIZE   ((size_t)1 << 14)
+
+#define TLS_HEADER_SIZE	5
+#define TLS_NONCE_OFFSET   TLS_HEADER_SIZE
+
+#define TLS_CRYPTO_INFO_READY(info)	((info)->cipher_type)
+
+#define TLS_RECORD_TYPE_DATA   0x17
+
+#define TLS_AAD_SPACE_SIZE 13
+
+struct tls_sw_context {
+   struct crypto_aead *aead_send;
+
+   /* Sending context */
+   char aad_space[TLS_AAD_SPACE_SIZE];
+
+   unsigned int sg_plaintext_size;
+   int sg_plaintext_num_elem;
+   struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS];
+
+   unsigned int sg_encrypted_size;

[PATCH v2 net-next 2/4] tcp: export do_tcp_sendpages and tcp_rate_check_app_limited functions

2017-06-06 Thread Dave Watson
Export do_tcp_sendpages and tcp_rate_check_app_limited, since tls will need to
sendpages while the socket is already locked.

tcp_sendpage is exported, but requires the socket lock to not be held already.

Signed-off-by: Aviad Yehezkel <avia...@mellanox.com>
Signed-off-by: Ilya Lesokhin <il...@mellanox.com>
Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tcp.h   | 2 ++
 net/ipv4/tcp.c  | 5 +++--
 net/ipv4/tcp_rate.c | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index fcc39f8..2b35100 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -353,6 +353,8 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
 int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+size_t size, int flags);
 void tcp_release_cb(struct sock *sk);
 void tcp_wfree(struct sk_buff *skb);
 void tcp_write_timer_handler(struct sock *sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0aa72cd..70efada 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -882,8 +882,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, 
int flags)
return mss_now;
 }
 
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
-   size_t size, int flags)
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+size_t size, int flags)
 {
struct tcp_sock *tp = tcp_sk(sk);
int mss_now, size_goal;
@@ -1013,6 +1013,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct 
page *page, int offset,
}
return sk_stream_error(sk, flags, err);
 }
+EXPORT_SYMBOL_GPL(do_tcp_sendpages);
 
 int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 size_t size, int flags)
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index ad99569..3330a37 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -185,3 +185,4 @@ void tcp_rate_check_app_limited(struct sock *sk)
tp->app_limited =
(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
 }
+EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
-- 
2.9.3



[PATCH v2 net-next 1/4] tcp: ULP infrastructure

2017-06-06 Thread Dave Watson
Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
ULP can add its own logic by changing the TCP proto_ops structure to its own
methods.

Example usage:

setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));

modules will call:
tcp_register_ulp(&tcp_tls_ulp_ops);

to register/unregister their ulp, with an init function and name.

A list of registered ulps will be returned by tcp_get_available_ulp, which is
hooked up to /proc.  Example:

$ cat /proc/sys/net/ipv4/tcp_available_ulp
tls

There is currently no functionality to remove or chain ULPs, but
it should be possible to add these in the future if needed.

Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/inet_connection_sock.h |   4 ++
 include/net/tcp.h  |  25 +++
 include/uapi/linux/tcp.h   |   1 +
 net/ipv4/Makefile  |   2 +-
 net/ipv4/sysctl_net_ipv4.c |  25 +++
 net/ipv4/tcp.c |  28 
 net/ipv4/tcp_ipv4.c|   2 +
 net/ipv4/tcp_ulp.c | 134 +
 8 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv4/tcp_ulp.c

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index c7a5779..13e4c89 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -75,6 +75,8 @@ struct inet_connection_sock_af_ops {
  * @icsk_pmtu_cookie  Last pmtu seen by socket
  * @icsk_ca_ops   Pluggable congestion control hook
  * @icsk_af_ops   Operations which are AF_INET{4,6} specific
+ * @icsk_ulp_ops  Pluggable ULP control hook
+ * @icsk_ulp_data ULP private data
  * @icsk_ca_state:Congestion control state
  * @icsk_retransmits: Number of unrecovered [RTO] timeouts
  * @icsk_pending: Scheduled timer event
@@ -97,6 +99,8 @@ struct inet_connection_sock {
__u32 icsk_pmtu_cookie;
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
+   const struct tcp_ulp_ops  *icsk_ulp_ops;
+   void  *icsk_ulp_data;
unsigned int  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8  icsk_ca_state:6,
  icsk_ca_setsockopt:1,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 82462db..fcc39f8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1992,4 +1992,29 @@ static inline void tcp_listendrop(const struct sock *sk)
 
 enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 
+/*
+ * Interface for adding Upper Level Protocols over TCP
+ */
+
+#define TCP_ULP_NAME_MAX   16
+#define TCP_ULP_MAX		128
+#define TCP_ULP_BUF_MAX		(TCP_ULP_NAME_MAX*TCP_ULP_MAX)
+
+struct tcp_ulp_ops {
+   struct list_headlist;
+
+   /* initialize ulp */
+   int (*init)(struct sock *sk);
+   /* cleanup ulp */
+   void (*release)(struct sock *sk);
+
+   charname[TCP_ULP_NAME_MAX];
+   struct module   *owner;
+};
+int tcp_register_ulp(struct tcp_ulp_ops *type);
+void tcp_unregister_ulp(struct tcp_ulp_ops *type);
+int tcp_set_ulp(struct sock *sk, const char *name);
+void tcp_get_available_ulp(char *buf, size_t len);
+void tcp_cleanup_ulp(struct sock *sk);
+
 #endif /* _TCP_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 38a2b07..8204dce 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -117,6 +117,7 @@ enum {
 #define TCP_SAVED_SYN  28  /* Get SYN headers recorded for 
connection */
 #define TCP_REPAIR_WINDOW  29  /* Get/set window parameters */
 #define TCP_FASTOPEN_CONNECT   30  /* Attempt FastOpen with connect */
+#define TCP_ULP		31	/* Attach a ULP to a TCP connection */
 
 struct tcp_repair_opt {
__u32   opt_code;
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f83de23..afcb435 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
 inet_timewait_sock.o inet_connection_sock.o \
 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
-tcp_rate.o tcp_recovery.o \
+tcp_rate.o tcp_recovery.o tcp_ulp.o \
 tcp_offload.o datagram.o raw.o udp.o udplite.o \
 udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 86957e9..6a40837c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sys

[PATCH v2 net-next 0/4] kernel TLS

2017-06-06 Thread Dave Watson
This series adds support for kernel TLS encryption over TCP sockets.
A standard TCP socket is converted to a TLS socket using a setsockopt.
Only symmetric crypto is done in the kernel, as well as TLS record
framing.  The handshake remains in userspace, and the negotiated
cipher keys/iv are provided to the TCP socket.

We implemented support for this API in OpenSSL 1.1.0, the code is
available at https://github.com/Mellanox/tls-openssl/tree/master

It should work with any TLS library with similar modifications,
a test tool using gnutls is here: https://github.com/Mellanox/tls-af_ktls_tool

Changes from V1:

* EXPORT_SYMBOL GPL in patch 2
* Add link to OpenSSL patch & gnutls example in documentation patch.
* sk_write_pending check was rolled into the wait_for_memory path,
  which avoids a special case and fixes a lock imbalance issue.
* Unify flag handling for sendmsg/sendfile

Changes from RFC V2:

* Generic ULP (upper layer protocol) framework instead of TLS specific
  setsockopts
* Dropped Mellanox hardware patches, will come as separate series.
  Framework will work for both.

RFC V2:

http://www.mail-archive.com/netdev@vger.kernel.org/msg160317.html

Changes from RFC V1:

* Socket based on changing TCP proto_ops instead of crypto framework
* Merged code with Mellanox's hardware tls offload
* Zerocopy sendmsg support added - sendpage/sendfile is no longer
  necessary for zerocopy optimization

RFC V1:

http://www.mail-archive.com/netdev@vger.kernel.org/msg88021.html

* Socket based on crypto userspace API framework, required two
  sockets in userspace, one encrypted, one unencrypted.

Paper: https://netdevconf.org/1.2/papers/ktls.pdf

Aviad Yehezkel (1):
  tcp: export do_tcp_sendpages and tcp_rate_check_app_limited functions

Boris Pismenny (2):
  tcp: ULP infrastructure
  tls: Documentation

Ilya Lesokhin (1):
  tls: kernel TLS support

 Documentation/networking/tls.txt   | 135 +++
 MAINTAINERS|  10 +
 include/linux/socket.h |   1 +
 include/net/inet_connection_sock.h |   4 +
 include/net/tcp.h  |  27 ++
 include/net/tls.h  | 222 +++
 include/uapi/linux/tcp.h   |   1 +
 include/uapi/linux/tls.h   |  79 
 net/Kconfig|   1 +
 net/Makefile   |   1 +
 net/ipv4/Makefile  |   2 +-
 net/ipv4/sysctl_net_ipv4.c |  25 ++
 net/ipv4/tcp.c |  33 +-
 net/ipv4/tcp_ipv4.c|   2 +
 net/ipv4/tcp_rate.c|   1 +
 net/ipv4/tcp_ulp.c | 134 +++
 net/tls/Kconfig|  12 +
 net/tls/Makefile   |   7 +
 net/tls/tls_main.c | 485 ++
 net/tls/tls_sw.c   | 794 +
 20 files changed, 1973 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/networking/tls.txt
 create mode 100644 include/net/tls.h
 create mode 100644 include/uapi/linux/tls.h
 create mode 100644 net/ipv4/tcp_ulp.c
 create mode 100644 net/tls/Kconfig
 create mode 100644 net/tls/Makefile
 create mode 100644 net/tls/tls_main.c
 create mode 100644 net/tls/tls_sw.c

-- 
2.9.3



[PATCH net-next 2/4] tcp: export do_tcp_sendpages and tcp_rate_check_app_limited functions

2017-05-24 Thread Dave Watson
Export do_tcp_sendpages and tcp_rate_check_app_limited, since tls will need to
sendpages while the socket is already locked.

tcp_sendpage is exported, but requires the socket lock to not be held already.

Signed-off-by: Aviad Yehezkel <avia...@mellanox.com>
Signed-off-by: Ilya Lesokhin <il...@mellanox.com>
Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/tcp.h   | 2 ++
 net/ipv4/tcp.c  | 5 +++--
 net/ipv4/tcp_rate.c | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index fcc39f8..2b35100 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -353,6 +353,8 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
 int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+size_t size, int flags);
 void tcp_release_cb(struct sock *sk);
 void tcp_wfree(struct sk_buff *skb);
 void tcp_write_timer_handler(struct sock *sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9f06faa..08a8ef4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -882,8 +882,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, 
int flags)
return mss_now;
 }
 
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
-   size_t size, int flags)
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+size_t size, int flags)
 {
struct tcp_sock *tp = tcp_sk(sk);
int mss_now, size_goal;
@@ -1013,6 +1013,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct 
page *page, int offset,
}
return sk_stream_error(sk, flags, err);
 }
+EXPORT_SYMBOL(do_tcp_sendpages);
 
 int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 size_t size, int flags)
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index ad99569..62876e4 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -185,3 +185,4 @@ void tcp_rate_check_app_limited(struct sock *sk)
tp->app_limited =
(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
 }
+EXPORT_SYMBOL(tcp_rate_check_app_limited);
-- 
2.9.3



[PATCH net-next 4/4] tls: Documentation

2017-05-24 Thread Dave Watson
Add documentation for the tcp ULP tls interface.

Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 Documentation/networking/tls.txt | 120 +++
 1 file changed, 120 insertions(+)
 create mode 100644 Documentation/networking/tls.txt

diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.txt
new file mode 100644
index 000..7bfb256
--- /dev/null
+++ b/Documentation/networking/tls.txt
@@ -0,0 +1,120 @@
+Overview
+
+
+Transport Layer Security (TLS) is an Upper Layer Protocol (ULP) that runs over
+TCP. TLS provides end-to-end data integrity and confidentiality.
+
+User interface
+==
+
+Creating a TLS connection
+-
+
+First create a new TCP socket and set the TLS ULP.
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
+
+Setting the TLS ULP allows us to set/get TLS socket options. Currently
+only the symmetric encryption is handled in the kernel.  After the TLS
+handshake is complete, we have all the parameters required to move the
+data-path to the kernel. There is a separate socket option for moving
+the transmit and the receive into the kernel.
+
+  /* From linux/tls.h */
+  struct tls_crypto_info {
+  unsigned short version;
+  unsigned short cipher_type;
+  };
+
+  struct tls12_crypto_info_aes_gcm_128 {
+  struct tls_crypto_info info;
+  unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE];
+  unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
+  unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE];
+  unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE];
+  };
+
+
+  struct tls12_crypto_info_aes_gcm_128 crypto_info;
+
+  crypto_info.info.version = TLS_1_2_VERSION;
+  crypto_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
+  memcpy(crypto_info.iv, iv_write, TLS_CIPHER_AES_GCM_128_IV_SIZE);
+  memcpy(crypto_info.rec_seq, seq_number_write,
+   TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
+  memcpy(crypto_info.key, cipher_key_write, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+  memcpy(crypto_info.salt, implicit_iv_write, 
TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+
+  setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
+
+Sending TLS application data
+----------------------------
+
+After setting the TLS_TX socket option all application data sent over this
+socket is encrypted using TLS and the parameters provided in the socket option.
+For example, we can send an encrypted hello world record as follows:
+
+  const char *msg = "hello world\n";
+  send(sock, msg, strlen(msg), 0);
+
+send() data is encrypted directly from the provided userspace buffer into
+the encrypted kernel send buffer when possible.
+
+The sendfile system call will send the file's data over TLS records of maximum
+length (2^14).
+
+  file = open(filename, O_RDONLY);
+  fstat(file, &stat);
+  sendfile(sock, file, &offset, stat.st_size);
+
+TLS records are created and sent after each send() call, unless
+MSG_MORE is passed.  MSG_MORE will delay creation of a record until
+MSG_MORE is not passed, or the maximum record size is reached.
+
+The kernel will need to allocate a buffer for the encrypted data.
+This buffer is allocated at the time send() is called, such that
+either the entire send() call will return -ENOMEM (or block waiting
+for memory), or the encryption will always succeed.  If send() returns
+-ENOMEM and some data was left on the socket buffer from a previous
+call using MSG_MORE, the MSG_MORE data is left on the socket buffer.
+
+Send TLS control messages
+-------------------------
+
+Other than application data, TLS has control messages such as alert
+messages (record type 21) and handshake messages (record type 22), etc.
+These messages can be sent over the socket by providing the TLS record type
+via a CMSG. For example the following function sends @data of @length bytes
+using a record of type @record_type.
+
+/* send TLS control message using record_type */
+  static int ktls_send_ctrl_message(int sock, unsigned char record_type,
+  void *data, size_t length)
+  {
+struct msghdr msg = {0};
+int cmsg_len = sizeof(record_type);
+struct cmsghdr *cmsg;
+char buf[CMSG_SPACE(cmsg_len)];
+struct iovec msg_iov;   /* Vector of data to send/receive into.  */
+
+msg.msg_control = buf;
+msg.msg_controllen = sizeof(buf);
+cmsg = CMSG_FIRSTHDR(&msg);
+cmsg->cmsg_level = SOL_TLS;
+cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
+cmsg->cmsg_len = CMSG_LEN(cmsg_len);
+*CMSG_DATA(cmsg) = record_type;
+msg.msg_controllen = cmsg->cmsg_len;
+
+msg_iov.iov_base = data;
+msg_iov.iov_len = length;
+msg.msg_iov = &msg_iov;
+msg.msg_iovlen = 1;
+
+return sendmsg(sock, &msg, 0);
+  }
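+
+  For example (illustrative usage only, not part of the original patch), a
+  close_notify alert (record type 21) could be sent as:
+
+    char alert[2] = { 1 /* warning */, 0 /* close_notify */ };
+
+    ktls_send_ctrl_message(sock, 21, alert, sizeof(alert));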

[PATCH net-next 3/4] tls: kernel TLS support

2017-05-24 Thread Dave Watson
Software implementation of transport layer security, implemented using ULP
infrastructure.  tcp proto_ops are replaced with tls equivalents of sendmsg and
sendpage.

Only symmetric crypto is done in the kernel; keys are passed by setsockopt
after the handshake is complete.  All control messages are supported via CMSG
data - the actual symmetric encryption is the same, only the message type needs
to be passed separately.

For user API, please see Documentation patch.

Pieces that can be shared between hw and sw implementation
are in tls_main.c

Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Ilya Lesokhin <il...@mellanox.com>
Signed-off-by: Aviad Yehezkel <avia...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 MAINTAINERS  |  10 +
 include/linux/socket.h   |   1 +
 include/net/tls.h| 223 ++
 include/uapi/linux/tls.h |  79 +
 net/Kconfig  |   1 +
 net/Makefile |   1 +
 net/tls/Kconfig  |  12 +
 net/tls/Makefile |   7 +
 net/tls/tls_main.c   | 450 +++
 net/tls/tls_sw.c | 788 +++
 10 files changed, 1572 insertions(+)
 create mode 100644 include/net/tls.h
 create mode 100644 include/uapi/linux/tls.h
 create mode 100644 net/tls/Kconfig
 create mode 100644 net/tls/Makefile
 create mode 100644 net/tls/tls_main.c
 create mode 100644 net/tls/tls_sw.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 9e98464..94bdbe8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8944,6 +8944,16 @@ F:   net/ipv6/
 F: include/net/ip*
 F: arch/x86/net/*
 
+NETWORKING [TLS]
+M: Ilya Lesokhin <il...@mellanox.com>
+M: Aviad Yehezkel <avia...@mellanox.com>
+M: Dave Watson <davejwat...@fb.com>
+L: net...@vger.kernel.org
+S: Maintained
+F: net/tls/*
+F: include/uapi/linux/tls.h
+F: include/net/tls.h
+
 NETWORKING [IPSEC]
 M: Steffen Klassert <steffen.klass...@secunet.com>
 M: Herbert Xu <herb...@gondor.apana.org.au>
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 0820274..8b13db5 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -334,6 +334,7 @@ struct ucred {
 #define SOL_ALG        279
 #define SOL_NFC        280
 #define SOL_KCM        281
+#define SOL_TLS        282
 
 /* IPX options */
 #define IPX_TYPE   1
diff --git a/include/net/tls.h b/include/net/tls.h
new file mode 100644
index 000..eee6ddf
--- /dev/null
+++ b/include/net/tls.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwat...@fb.com>. All rights 
reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _TLS_OFFLOAD_H
+#define _TLS_OFFLOAD_H
+
+#include 
+
+#include 
+
+
+/* Maximum data size carried in a TLS record */
+#define TLS_MAX_PAYLOAD_SIZE   ((size_t)1 << 14)
+
+#define TLS_HEADER_SIZE        5
+#define TLS_NONCE_OFFSET   TLS_HEADER_SIZE
+
+#define TLS_CRYPTO_INFO_READY(info)    ((info)->cipher_type)
+
+#define TLS_RECORD_TYPE_DATA   0x17
+
+#define TLS_AAD_SPACE_SIZE 13
+
+struct tls_sw_context {
+   struct crypto_aead *aead_send;
+
+   /* Sending context */
+   char aad_space[TLS_AAD_SPACE_SIZE];
+
+   unsigned int sg_plaintext_size;
+   int sg_plaintext_num_elem;
+   struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS];
+
+   unsigned int sg_encrypted_size;

[PATCH net-next 1/4] tcp: ULP infrastructure

2017-05-24 Thread Dave Watson
Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
ULP can add its own logic by changing the TCP proto_ops structure to its own
methods.

Example usage:

setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));

modules will call:
tcp_register_ulp(&tcp_tls_ulp_ops);

to register/unregister their ulp, with an init function and name.

A list of registered ulps will be returned by tcp_get_available_ulp, which is
hooked up to /proc.  Example:

$ cat /proc/sys/net/ipv4/tcp_available_ulp
tls

There is currently no functionality to remove or chain ULPs, but
it should be possible to add these in the future if needed.
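
As an illustration (not part of this series), a minimal ULP module built on
this interface might look like the sketch below; the "demo" name and the empty
init/release bodies are hypothetical.  Userspace would then select it with
setsockopt(sock, SOL_TCP, TCP_ULP, "demo", sizeof("demo")).

  #include <linux/module.h>
  #include <net/tcp.h>

  /* Do-nothing example ULP; a real ULP (e.g. tls) would install its own
   * proto_ops in init() and hang its state off icsk_ulp_data. */
  static int demo_ulp_init(struct sock *sk)
  {
          return 0;
  }

  static void demo_ulp_release(struct sock *sk)
  {
  }

  static struct tcp_ulp_ops demo_ulp_ops = {
          .name    = "demo",
          .owner   = THIS_MODULE,
          .init    = demo_ulp_init,
          .release = demo_ulp_release,
  };

  static int __init demo_ulp_module_init(void)
  {
          return tcp_register_ulp(&demo_ulp_ops);
  }

  static void __exit demo_ulp_module_exit(void)
  {
          tcp_unregister_ulp(&demo_ulp_ops);
  }

  module_init(demo_ulp_module_init);
  module_exit(demo_ulp_module_exit);
  MODULE_LICENSE("GPL");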

Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Dave Watson <davejwat...@fb.com>
---
 include/net/inet_connection_sock.h |   4 ++
 include/net/tcp.h  |  25 +++
 include/uapi/linux/tcp.h   |   1 +
 net/ipv4/Makefile  |   2 +-
 net/ipv4/sysctl_net_ipv4.c |  25 +++
 net/ipv4/tcp.c |  28 
 net/ipv4/tcp_ipv4.c|   2 +
 net/ipv4/tcp_ulp.c | 134 +
 8 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv4/tcp_ulp.c

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index c7a5779..13e4c89 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -75,6 +75,8 @@ struct inet_connection_sock_af_ops {
  * @icsk_pmtu_cookie  Last pmtu seen by socket
  * @icsk_ca_ops   Pluggable congestion control hook
  * @icsk_af_ops   Operations which are AF_INET{4,6} specific
+ * @icsk_ulp_ops  Pluggable ULP control hook
+ * @icsk_ulp_data ULP private data
  * @icsk_ca_state:Congestion control state
  * @icsk_retransmits: Number of unrecovered [RTO] timeouts
  * @icsk_pending: Scheduled timer event
@@ -97,6 +99,8 @@ struct inet_connection_sock {
__u32 icsk_pmtu_cookie;
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
+   const struct tcp_ulp_ops  *icsk_ulp_ops;
+   void  *icsk_ulp_data;
unsigned int  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8  icsk_ca_state:6,
  icsk_ca_setsockopt:1,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 82462db..fcc39f8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1992,4 +1992,29 @@ static inline void tcp_listendrop(const struct sock *sk)
 
 enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 
+/*
+ * Interface for adding Upper Level Protocols over TCP
+ */
+
+#define TCP_ULP_NAME_MAX   16
+#define TCP_ULP_MAX        128
+#define TCP_ULP_BUF_MAX    (TCP_ULP_NAME_MAX*TCP_ULP_MAX)
+
+struct tcp_ulp_ops {
+   struct list_headlist;
+
+   /* initialize ulp */
+   int (*init)(struct sock *sk);
+   /* cleanup ulp */
+   void (*release)(struct sock *sk);
+
+   charname[TCP_ULP_NAME_MAX];
+   struct module   *owner;
+};
+int tcp_register_ulp(struct tcp_ulp_ops *type);
+void tcp_unregister_ulp(struct tcp_ulp_ops *type);
+int tcp_set_ulp(struct sock *sk, const char *name);
+void tcp_get_available_ulp(char *buf, size_t len);
+void tcp_cleanup_ulp(struct sock *sk);
+
 #endif /* _TCP_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 38a2b07..8204dce 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -117,6 +117,7 @@ enum {
 #define TCP_SAVED_SYN  28  /* Get SYN headers recorded for 
connection */
 #define TCP_REPAIR_WINDOW  29  /* Get/set window parameters */
 #define TCP_FASTOPEN_CONNECT   30  /* Attempt FastOpen with connect */
+#define TCP_ULP            31  /* Attach a ULP to a TCP connection */
 
 struct tcp_repair_opt {
__u32   opt_code;
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f83de23..afcb435 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
 inet_timewait_sock.o inet_connection_sock.o \
 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
-tcp_rate.o tcp_recovery.o \
+tcp_rate.o tcp_recovery.o tcp_ulp.o \
 tcp_offload.o datagram.o raw.o udp.o udplite.o \
 udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 86957e9..6a40837c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sys

[PATCH net-next 0/4] kernel TLS

2017-05-24 Thread Dave Watson
This series adds support for kernel TLS encryption over TCP sockets.
A standard TCP socket is converted to a TLS socket using a setsockopt.
Only symmetric crypto is done in the kernel, as well as TLS record
framing.  The handshake remains in userspace, and the negotiated
cipher keys/iv are provided to the TCP socket.

We implemented support for this API in OpenSSL 1.1.0; the code is
available at https://github.com/Mellanox/tls-openssl/tree/master

It should work with any TLS library with similar modifications;
a test tool using gnutls is here: https://github.com/Mellanox/tls-af_ktls_tool

Changes from RFC V2:

* Generic ULP (upper layer protocol) framework instead of TLS specific
  setsockopts
* Dropped Mellanox hardware patches, will come as separate series.
  Framework will work for both.

RFC V2:

http://www.mail-archive.com/netdev@vger.kernel.org/msg160317.html

Changes from RFC V1:

* Socket based on changing TCP proto_ops instead of crypto framework
* Merged code with Mellanox's hardware tls offload
* Zerocopy sendmsg support added - sendpage/sendfile is no longer
  necessary for zerocopy optimization

RFC V1:

http://www.mail-archive.com/netdev@vger.kernel.org/msg88021.html

* Socket based on crypto userspace API framework, required two
  sockets in userspace, one encrypted, one unencrypted.

Paper: https://netdevconf.org/1.2/papers/ktls.pdf

Aviad Yehezkel (1):
  tcp: export do_tcp_sendpages and tcp_rate_check_app_limited functions

Boris Pismenny (2):
  tcp: ULP infrastructure
  tls: Documentation

Ilya Lesokhin (1):
  tls: kernel TLS support

 Documentation/networking/tls.txt   | 120 ++
 MAINTAINERS|  10 +
 include/linux/socket.h |   1 +
 include/net/inet_connection_sock.h |   4 +
 include/net/tcp.h  |  27 ++
 include/net/tls.h  | 223 +++
 include/uapi/linux/tcp.h   |   1 +
 include/uapi/linux/tls.h   |  79 
 net/Kconfig|   1 +
 net/Makefile   |   1 +
 net/ipv4/Makefile  |   2 +-
 net/ipv4/sysctl_net_ipv4.c |  25 ++
 net/ipv4/tcp.c |  33 +-
 net/ipv4/tcp_ipv4.c|   2 +
 net/ipv4/tcp_rate.c|   1 +
 net/ipv4/tcp_ulp.c | 134 +++
 net/tls/Kconfig|  12 +
 net/tls/Makefile   |   7 +
 net/tls/tls_main.c | 450 +
 net/tls/tls_sw.c   | 788 +
 20 files changed, 1918 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/networking/tls.txt
 create mode 100644 include/net/tls.h
 create mode 100644 include/uapi/linux/tls.h
 create mode 100644 net/ipv4/tcp_ulp.c
 create mode 100644 net/tls/Kconfig
 create mode 100644 net/tls/Makefile
 create mode 100644 net/tls/tls_main.c
 create mode 100644 net/tls/tls_sw.c

-- 
2.9.3



Re: [PATCH] crypto: aesni-intel - RFC4106 can zero copy when !PageHighMem

2016-12-13 Thread Dave Watson
On 12/13/16 04:32 PM, Ilya Lesokhin wrote:
> --- a/arch/x86/crypto/aesni-intel_glue.c
> +++ b/arch/x86/crypto/aesni-intel_glue.c
> @@ -903,9 +903,11 @@ static int helper_rfc4106_encrypt(struct aead_request 
> *req)
>   *((__be32 *)(iv+12)) = counter;
>  
>   if (sg_is_last(req->src) &&
> - req->src->offset + req->src->length <= PAGE_SIZE &&
> + (!PageHighMem(sg_page(req->src)) ||
> + req->src->offset + req->src->length <= PAGE_SIZE) &&
>   sg_is_last(req->dst) &&
> - req->dst->offset + req->dst->length <= PAGE_SIZE) {
> + (!PageHighMem(sg_page(req->dst)) ||
> + req->dst->offset + req->dst->length <= PAGE_SIZE)) {
>   one_entry_in_sg = 1;
>   scatterwalk_start(&src_sg_walk, req->src);
>   assoc = scatterwalk_map(&src_sg_walk);

I was also experimenting with a similar patch that loosened up the
restrictions here, checking for highmem.  Note that you can go even
further and check the AAD, data, and TAG all separately, the current
aesni crypto routines take them as separate buffers.  (This might fix
the RFC5288 patch AAD size issue?)
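
Something like the following helper (sketch only; the name is mine, not
from the posted patch) could then be applied to the AAD, data, and tag
scatterlists individually:

  #include <linux/highmem.h>
  #include <linux/scatterlist.h>

  /* True when a single-entry scatterlist can be mapped and used directly:
   * it is in lowmem, or it does not cross a page boundary. */
  static bool sg_direct_mappable(struct scatterlist *sg)
  {
          return sg_is_last(sg) &&
                 (!PageHighMem(sg_page(sg)) ||
                  sg->offset + sg->length <= PAGE_SIZE);
  }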

Long term it would be nice to improve the asm routines instead to
support scatter / gather IO and any AAD len, as the newer intel
routines do:

https://github.com/01org/isa-l_crypto/tree/master/aes


[RFC PATCH 2/2] Crypto kernel tls socket

2015-11-23 Thread Dave Watson
Userspace crypto interface for TLS.  Currently supports gcm(aes) 128-bit only;
however, the interface is the same as the rest of the SOCK_ALG interface, so it
should be possible to add more ciphers without any user interface changes.

Currently gcm(aes) represents ~80% of our SSL connections.

Userspace interface:

1) A transform and op socket are created using the userspace crypto interface
2) Setsockopt ALG_SET_AUTHSIZE is called
3) Setsockopt ALG_SET_KEY is called twice, since we need both send/recv keys
4) ALG_SET_IV cmsgs are sent twice, since we need both send/recv IVs.
   To support userspace heartbeats, changeciphersuite, etc, we would also need
   to get these back out, use them, then reset them via CMSG.
5) ALG_SET_OP cmsg is overloaded to mean FD to read/write from.

Example program:

https://github.com/djwatson/ktls
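
A rough sketch of steps 1-3 above using the standard AF_ALG calls (hedged:
the "tls" salg_type string, key handling, and error handling are assumptions,
not the interface exactly as posted; see the example program for the real flow):

  #include <unistd.h>
  #include <sys/socket.h>
  #include <linux/if_alg.h>

  int tls_alg_setup(const unsigned char *send_key, const unsigned char *recv_key,
                    size_t keylen)
  {
          struct sockaddr_alg sa = {
                  .salg_family = AF_ALG,
                  .salg_type   = "tls",        /* assumed type string */
                  .salg_name   = "gcm(aes)",
          };
          int tfmfd, opfd;

          tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
          if (tfmfd < 0 || bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                  return -1;

          /* Step 3: both send and recv keys are set on the transform socket. */
          setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, send_key, keylen);
          setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, recv_key, keylen);

          /* Step 2 (authsize) and steps 4-5 (IVs and the chained TCP fd) are
           * set via the sockopts/cmsgs described above. */
          opfd = accept(tfmfd, NULL, 0);
          close(tfmfd);
          return opfd;    /* op socket used for read/write */
  }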

At a high level, this could be implemented on TCP sockets directly instead with
various tradeoffs.

The userspace crypto interface might benefit from some interface
tweaking to deal with multiple keys / ivs better.  The crypto accept()
op socket interface isn't a great fit, since there are never multiple
parallel operations.

There's also some questions around using skbuffs instead of scatterlists for
send/recv, and if we are buffering on recv, when we should be decrypting the
data.
---
 crypto/Kconfig |   12 +
 crypto/Makefile|1 +
 crypto/algif_tls.c | 1233 
 3 files changed, 1246 insertions(+)
 create mode 100644 crypto/algif_tls.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 7240821..c15638a 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1639,6 +1639,18 @@ config CRYPTO_USER_API_AEAD
  This option enables the user-spaces interface for AEAD
  cipher algorithms.

+config CRYPTO_USER_API_TLS
+   tristate "User-space interface for TLS net sockets"
+   depends on NET
+   select CRYPTO_AEAD
+   select CRYPTO_USER_API
+   help
+ This option enables kernel TLS socket framing
+ cipher algorithms.  TLS framing is added/removed and
+  chained to a TCP socket.  Handshake is done in
+  userspace.
+
+
 config CRYPTO_HASH_INFO
bool

diff --git a/crypto/Makefile b/crypto/Makefile
index f7aba92..fc26012 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -121,6 +121,7 @@ obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
 obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
 obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
+obj-$(CONFIG_CRYPTO_USER_API_TLS) += algif_tls.o

 #
 # generic algorithms and the async_tx api
diff --git a/crypto/algif_tls.c b/crypto/algif_tls.c
new file mode 100644
index 000..123ade3
--- /dev/null
+++ b/crypto/algif_tls.c
@@ -0,0 +1,1233 @@
+/*
+ * algif_tls: User-space interface for TLS
+ *
+ * Copyright (C) 2015, Dave Watson <davejwat...@fb.com>
+ *
+ * This file provides the user-space API for AEAD ciphers.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define TLS_HEADER_SIZE 13
+#define TLS_TAG_SIZE 16
+#define TLS_IV_SIZE 8
+#define TLS_PADDED_AADLEN 16
+#define TLS_MAX_MESSAGE_LEN (1 << 14)
+
+/* Bytes not included in tls msg size field */
+#define TLS_FRAMING_SIZE 5
+
+#define TLS_APPLICATION_DATA_MSG 0x17
+#define TLS_VERSION 3
+
+struct tls_tfm_pair {
+   struct crypto_aead *tfm_send;
+   struct crypto_aead *tfm_recv;
+   int cur_setkey;
+};
+
+static struct workqueue_struct *tls_wq;
+
+struct tls_sg_list {
+   unsigned int cur;
+   struct scatterlist sg[ALG_MAX_PAGES];
+};
+
+#define RSGL_MAX_ENTRIES ALG_MAX_PAGES
+
+struct tls_ctx {
+   /* Send and encrypted transmit buffers */
+   struct tls_sg_list tsgl;
+   struct scatterlist tcsgl[ALG_MAX_PAGES];
+
+   /* Encrypted receive and receive buffers. */
+   struct tls_sg_list rcsgl;
+   struct af_alg_sgl rsgl[RSGL_MAX_ENTRIES];
+
+   /* Sequence numbers. */
+   int iv_set;
+   void *iv_send;
+   void *iv_recv;
+
+   struct af_alg_completion completion;
+
+   /* Bytes to send */
+   unsigned long used;
+
+   /* padded */
+   size_t aead_assoclen;
+   /* unpadded */
+   size_t assoclen;
+   struct aead_request aead_req;
+   struct aead_request aead_resp;
+
+   bool more;
+   bool merge;
+
+   /* Chained TCP socket */
+   struct sock *sock;
+   struct socket *socket;
+
+   void (*save_data_ready)(struct sock *sk);
+   void (*save_write_space)(struct sock *sk);
+   void (*save_state_change)(struct sock *sk);
+   stru

[RFC PATCH 0/2] Crypto kernel TLS socket

2015-11-23 Thread Dave Watson
An approach for a kernel TLS socket.

Only the symmetric encryption / decryption is done in-kernel, as well
as minimal framing handling.  The handshake is kept in userspace, and
the negotiated cipher / keys / IVs are then set on the algif_tls
socket, which is then hooked in to a tcp socket using
sk_write_space/sk_data_ready hooks.

If a non application-data TLS record is seen, it is left on the TCP
socket and an error is returned on the ALG socket, and the record is
left for userspace to manage. Userspace can't ignore the message, but
could just close the socket.

TLS could potentially also be done directly on the TCP socket, but
seemed a bit harder to work with the OOB data for non application_data
messages, and the sockopts / CMSGS already exist for ALG sockets.  The
flip side is having to manage two fds in userspace.

Some reasons we're looking at this:

1) Access to sendfile/splice for CDN-type applications.  We were
   inspired by Netflix exploring this in FreeBSD

   https://people.freebsd.org/~rrs/asiabsd_2015_tls.pdf

   For perf, this patch is almost on par with userspace OpenSSL.
   Currently there are some copies and allocs to support
   scatter/gather in aesni-intel_glue.c, but with some extra work to
   remove those (not included here), a sendfile() is faster than the
   equivalent userspace read/SSL_write using a 128k buffer by 2~7%.

2) Access to the unencrypted bytes in kernelspace.  For example, Tom
   Herbert's kcm would need this

   https://lwn.net/Articles/657999/

3) NIC offload. To support running aesni routines on the NIC instead
   of the processor, we would probably need enough of the framing
   interface put in kernel.


Dave Watson (2):
  Crypto support aesni rfc5288
  Crypto kernel tls socket

 arch/x86/crypto/aesni-intel_asm.S|6 +
 arch/x86/crypto/aesni-intel_avx-x86_64.S |4 +
 arch/x86/crypto/aesni-intel_glue.c   |  105 ++-
 crypto/Kconfig   |   12 +
 crypto/Makefile  |1 +
 crypto/algif_tls.c   | 1233 ++
 6 files changed, 1334 insertions(+), 27 deletions(-)
 create mode 100644 crypto/algif_tls.c

--
2.4.6


[RFC PATCH 1/2] Crypto support aesni rfc5288

2015-11-23 Thread Dave Watson
Support rfc5288 using intel aesni routines.  See also rfc5246.

AAD length is 13 bytes, padded out to 16.  Padding bytes currently have to be
passed in via the scatterlist, which probably isn't quite the
right fix.

The assoclen checks were moved to the individual rfc stubs, and the
common routines support all assoc lengths.
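
A sketch of the rounding the asm hunks below perform (the helper name is
mine, not from the patch):

  /* Round the AAD length up to a 4-byte boundary, matching the
   * "add $3, %r12; and $~3, %r12" sequence added below; for the
   * 13-byte TLS AAD this gives the padded length of 16. */
  static inline unsigned int padded_aad_len(unsigned int assoclen)
  {
          return (assoclen + 3) & ~3U;
  }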

---
 arch/x86/crypto/aesni-intel_asm.S|   6 ++
 arch/x86/crypto/aesni-intel_avx-x86_64.S |   4 ++
 arch/x86/crypto/aesni-intel_glue.c   | 105 +++
 3 files changed, 88 insertions(+), 27 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 6bd2c6c..49667c4 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -228,6 +228,9 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 MOVADQ SHUF_MASK(%rip), %xmm14
movarg7, %r10   # %r10 = AAD
movarg8, %r12   # %r12 = aadLen
+   add$3, %r12
+   and$~3, %r12
+
mov%r12, %r11
pxor   %xmm\i, %xmm\i

@@ -453,6 +456,9 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 MOVADQ SHUF_MASK(%rip), %xmm14
movarg7, %r10   # %r10 = AAD
movarg8, %r12   # %r12 = aadLen
+   add$3, %r12
+   and$~3, %r12
+
mov%r12, %r11
pxor   %xmm\i, %xmm\i
 _get_AAD_loop\num_initial_blocks\operation:
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S 
b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 522ab68..0756e4a 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -360,6 +360,8 @@ VARIABLE_OFFSET = 16*8

 mov arg6, %r10  # r10 = AAD
 mov arg7, %r12  # r12 = aadLen
+add $3, %r12
+and $~3, %r12


 mov %r12, %r11
@@ -1619,6 +1621,8 @@ ENDPROC(aesni_gcm_dec_avx_gen2)

 mov arg6, %r10   # r10 = AAD
 mov arg7, %r12   # r12 = aadLen
+add $3, %r12
+and $~3, %r12


 mov %r12, %r11
diff --git a/arch/x86/crypto/aesni-intel_glue.c 
b/arch/x86/crypto/aesni-intel_glue.c
index 3633ad6..00a42ca 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -949,12 +949,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
struct scatter_walk src_sg_walk;
struct scatter_walk dst_sg_walk;
unsigned int i;
-
-   /* Assuming we are supporting rfc4106 64-bit extended */
-   /* sequence numbers We need to have the AAD length equal */
-   /* to 16 or 20 bytes */
-   if (unlikely(req->assoclen != 16 && req->assoclen != 20))
-   return -EINVAL;
+   unsigned int padded_assoclen = (req->assoclen + 3) & ~3;

/* IV below built */
for (i = 0; i < 4; i++)
@@ -970,21 +965,21 @@ static int helper_rfc4106_encrypt(struct aead_request 
*req)
one_entry_in_sg = 1;
scatterwalk_start(&src_sg_walk, req->src);
assoc = scatterwalk_map(&src_sg_walk);
-   src = assoc + req->assoclen;
+   src = assoc + padded_assoclen;
dst = src;
if (unlikely(req->src != req->dst)) {
scatterwalk_start(&dst_sg_walk, req->dst);
-   dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
+   dst = scatterwalk_map(&dst_sg_walk) + padded_assoclen;
}
} else {
/* Allocate memory for src, dst, assoc */
-   assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+   assoc = kmalloc(req->cryptlen + auth_tag_len + padded_assoclen,
GFP_ATOMIC);
if (unlikely(!assoc))
return -ENOMEM;
scatterwalk_map_and_copy(assoc, req->src, 0,
-req->assoclen + req->cryptlen, 0);
-   src = assoc + req->assoclen;
+padded_assoclen + req->cryptlen, 0);
+   src = assoc + padded_assoclen;
dst = src;
}

@@ -998,7 +993,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
 * back to the packet. */
if (one_entry_in_sg) {
if (unlikely(req->src != req->dst)) {
-   scatterwalk_unmap(dst - req->assoclen);
+   scatterwalk_unmap(dst - padded_assoclen);
scatterwalk_advance(&dst_sg_walk, req->dst->length);
scatterwalk_done(&dst_sg_walk, 1, 0);
}
@@ -1006,7 +1001,7 @@ static int helper_rfc4106_encrypt(struct aead_request 
*req)
scatterwalk_advance(&src_sg_walk, req->src->length);
scatterwalk_done(&src_sg_walk, 

Re: [RFC PATCH 2/2] Crypto kernel tls socket

2015-11-23 Thread Dave Watson
On 11/23/15 02:27 PM, Sowmini Varadhan wrote:
> On (11/23/15 09:43), Dave Watson wrote:
> > Currently gcm(aes) represents ~80% of our SSL connections.
> >
> > Userspace interface:
> >
> > 1) A transform and op socket are created using the userspace crypto 
> > interface
> > 2) Setsockopt ALG_SET_AUTHSIZE is called
> > 3) Setsockopt ALG_SET_KEY is called twice, since we need both send/recv keys
> > 4) ALG_SET_IV cmsgs are sent twice, since we need both send/recv IVs.
> >To support userspace heartbeats, changeciphersuite, etc, we would also 
> > need
> >to get these back out, use them, then reset them via CMSG.
> > 5) ALG_SET_OP cmsg is overloaded to mean FD to read/write from.
>
> [from patch 0/2:]
> > If a non application-data TLS record is seen, it is left on the TCP
> > socket and an error is returned on the ALG socket, and the record is
> > left for userspace to manage.
>
> I'm trying to see how your approach would fit with the RDS-type of
> use-case. RDS-TCP is mostly similar in concept to kcm,
> except that rds has its own header for multiplexing, and has no
> dependancy on BPF for basic things like re-assembling the datagram.
> If I were to try to use this for RDS-TCP, the tls_tcp_read_sock() logic
> would be merged into the recv_actor callback for RDS, right?  Thus tls
> control-plane message could be seen in the middle of the
> data-stream, so we really have to freeze the processing of the data
> stream till the control-plane message is processed?

Correct.

> In the tls.c example that you have, the opfd is generated from
> the accept() on the AF_ALG socket- how would this work if I wanted
> my opfd to be a PF_RDS or a PF_KCM or similar?

For kcm, opfd is the fd you would pass along in kcm_attach.

For rds, it looks like you'd want to use opfd as the sock instead of
the new one created by sock_create_kern in rds_tcp_conn_connect.

> One concern is that this patchset provides a solution for the "80%"
> case but what about the other 20% (and the non x86 platforms)?

Almost all the rest are aes sha.  The actual encrypt / decrypt code
would be similar to this previous patch:

http://marc.info/?l=linux-kernel&m=140662647602192&w=2

The software routines in gcm(aes) should work for all platforms
without aesni.

> E.g., if I get a cipher-suite request outside the aes-ni, what would
> happen (punt to uspace?)
>
> --Sowmini

Right, bind() would fail and you would fallback to uspace.