[email protected] (Niels Möller) writes:

> (iii) I've considered doing it earlier, to make it easier to implement
>       aes without a round loop (like for all current versions of
>       aes-encrypt-internal.*). E.g., on x86_64, for aes128 we could load
>       all subkeys into registers and still have registers left to do two
>       or more blocks in parallel, but then we'd need to override
>       aes128_encrypt separately from the other aes*_encrypt.

I've given this a try; see the experimental patch below. It adds an
x86_64/aesni/aes128-encrypt.asm, with a 2-way loop. It gives a very
modest speedup, 5%, when I benchmark on my laptop (which is now a pretty
fast machine, AMD Ryzen 5). I've also added a cbc-aes128-encrypt.asm.
That gives a more significant speedup, almost 60%. I think the main reason
for the speedup is that we avoid reloading subkeys between blocks.

If we want to go this way, I wonder how to do it without an explosion of
files and functions. For s390x, it seems each function will be very
small, but not so for most other archs. There are at least three modes
that are similar to cbc encrypt in that they have to process blocks
sequentially, with no parallelism: CBC encrypt, CMAC, and XTS (there may
be more). It's not so nice if we need (modes × ciphers) number of assembly
files, with lots of duplication.

Regards,
/Niels

diff --git a/ChangeLog b/ChangeLog
index 3d19b1dd..68b8f632 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,13 @@
 2021-04-01  Niels Möller  <[email protected]>
 
+       * cbc-aes128-encrypt.c (nettle_cbc_aes128_encrypt): New file and function.
+       * x86_64/aesni/cbc-aes128-encrypt.asm: New file.
+
+       * configure.ac (asm_replace_list): Add aes128-encrypt.asm
+       aes128-decrypt.asm.
+       * x86_64/aesni/aes128-encrypt.asm: New file, with 2-way loop.
+       * x86_64/aesni/aes128-decrypt.asm: Likewise.
+
        Move aes128_encrypt and similar functions to their own files. To
        make it easier for assembly implementations to override specific
        AES variants.
diff --git a/Makefile.in b/Makefile.in
index 8d474d1e..b6b983fd 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -101,7 +101,8 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
                 camellia256-set-encrypt-key.c camellia256-crypt.c \
                 camellia256-set-decrypt-key.c \
                 camellia256-meta.c \
-                cast128.c cast128-meta.c cbc.c \
+                cast128.c cast128-meta.c \
+                cbc.c cbc-aes128-encrypt.c \
                 ccm.c ccm-aes128.c ccm-aes192.c ccm-aes256.c cfb.c \
                 siv-cmac.c siv-cmac-aes128.c siv-cmac-aes256.c \
                 cnd-memcpy.c \
diff --git a/cbc-aes128-encrypt.c b/cbc-aes128-encrypt.c
new file mode 100644
index 00000000..5f7d1c8c
--- /dev/null
+++ b/cbc-aes128-encrypt.c
@@ -0,0 +1,42 @@
+/* cbc-aes128-encrypt.c
+
+   Copyright (C) 2013, 2014 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "cbc.h"
+
+void
+nettle_cbc_aes128_encrypt(struct cbc_aes128_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src)
+{
+  CBC_ENCRYPT(ctx, aes128_encrypt, length, dst, src); /* Generic C version, built on the CBC_ENCRYPT macro. */
+}
diff --git a/cbc.h b/cbc.h
index 93b2e739..beece610 100644
--- a/cbc.h
+++ b/cbc.h
@@ -35,6 +35,7 @@
 #define NETTLE_CBC_H_INCLUDED
 
 #include "nettle-types.h"
+#include "aes.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -79,6 +80,10 @@ memcpy((ctx)->iv, (data), sizeof((ctx)->iv))
                 sizeof((self)->iv), (self)->iv,        \
                 (length), (dst), (src)))
 
+struct cbc_aes128_ctx CBC_CTX(struct aes128_ctx, AES_BLOCK_SIZE); /* AES-128 cipher context (ctx) plus the CBC iv */
+void
+nettle_cbc_aes128_encrypt(struct cbc_aes128_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/configure.ac b/configure.ac
index be2916c1..26e41d89 100644
--- a/configure.ac
+++ b/configure.ac
@@ -544,6 +544,7 @@ fi
 # Files which replace a C source file (or otherwise don't correspond
 # to a new object file).
 asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
+               aes128-encrypt.asm aes128-decrypt.asm cbc-aes128-encrypt.asm \
                arcfour-crypt.asm camellia-crypt-internal.asm \
                md5-compress.asm memxor.asm memxor3.asm \
                poly1305-internal.asm \
diff --git a/examples/nettle-benchmark.c b/examples/nettle-benchmark.c
index 9ce3a733..686cf3b9 100644
--- a/examples/nettle-benchmark.c
+++ b/examples/nettle-benchmark.c
@@ -240,6 +240,21 @@ bench_ctr(void *arg)
            BENCH_BLOCK, info->dst, info->src);
 }
 
+struct bench_cbc_aes128_info
+{ /* Argument bundle handed to bench_cbc_aes128 through a void pointer. */
+  struct cbc_aes128_ctx ctx; /* AES-128 key state and CBC iv */
+
+  const uint8_t *src; /* input buffer */
+  uint8_t *dst; /* output buffer */
+};
+
+static void
+bench_cbc_aes128(void *arg)
+{ /* Timed callback: one CBC encryption of BENCH_BLOCK bytes. */
+  struct bench_cbc_aes128_info *info = arg; /* arg is the bench_cbc_aes128_info given to time_function */
+  nettle_cbc_aes128_encrypt(&info->ctx, BENCH_BLOCK, info->dst, info->src);
+}
+
 struct bench_aead_info
 {
   void *ctx;
@@ -740,6 +755,29 @@ time_cipher(const struct nettle_cipher *cipher)
   free(key);
 }
 
+static void
+time_cbc_aes128(void)
+{ /* Sets up key, iv and buffers, then times bench_cbc_aes128 and passes the result to display(). */
+  struct bench_cbc_aes128_info info;
+  uint8_t key[AES128_KEY_SIZE];
+  uint8_t iv[AES_BLOCK_SIZE];
+
+  static uint8_t src_data[BENCH_BLOCK]; /* static storage, not on the stack */
+  static uint8_t data[BENCH_BLOCK];
+
+  init_key(sizeof(key), key);
+  init_key(sizeof(iv), iv); /* the iv is filled with the same helper as the key */
+  init_data(data);
+  init_data(src_data);
+
+  aes128_set_encrypt_key(&info.ctx.ctx, key); /* info.ctx.ctx is the struct aes128_ctx inside the CBC context */
+  CBC_SET_IV(&info.ctx, iv);
+  info.src = src_data;
+  info.dst = data;
+  display("aes128", "new cbc", AES_BLOCK_SIZE,
+         time_function(bench_cbc_aes128, &info));
+}
+
 static void
 time_aead(const struct nettle_aead *aead)
 {
@@ -1027,6 +1065,9 @@ main(int argc, char **argv)
       if (!alg || strstr ("hmac-sha512", alg))
        time_hmac_sha512();
 
+      if (!alg || strstr ("cbc-aes128", alg))
+       time_cbc_aes128();
+
       optind++;
     } while (alg && argv[optind]);
 
diff --git a/testsuite/cbc-test.c b/testsuite/cbc-test.c
index 9394f1cb..ff0c4cbe 100644
--- a/testsuite/cbc-test.c
+++ b/testsuite/cbc-test.c
@@ -3,6 +3,43 @@
 #include "cbc.h"
 #include "knuth-lfib.h"
 
+static void
+test_cbc_aes128(const struct tstring *key,
+               const struct tstring *cleartext,
+               const struct tstring *ciphertext,
+               const struct tstring *iiv)
+{ /* Known-answer test: CBC-encrypt cleartext under key and iiv, and fail unless the result equals ciphertext. */
+  struct cbc_aes128_ctx ctx;
+  uint8_t *data;
+  size_t length;
+
+  ASSERT (cleartext->length == ciphertext->length);
+  length = cleartext->length;
+
+  ASSERT (key->length == AES128_KEY_SIZE);
+  ASSERT (iiv->length == AES_BLOCK_SIZE);
+
+  data = xalloc(length); /* separate output buffer; the cleartext is not modified */
+  aes128_set_encrypt_key(&ctx.ctx, key->data);
+  CBC_SET_IV(&ctx, iiv->data);
+
+  nettle_cbc_aes128_encrypt(&ctx,
+                           length, data, cleartext->data);
+
+  if (!MEMEQ(length, data, ciphertext->data))
+    { /* mismatch: print input, actual output and expected output to stderr, then fail */
+      fprintf(stderr, "CBC encrypt failed:\nInput:");
+      tstring_print_hex(cleartext);
+      fprintf(stderr, "\nOutput: ");
+      print_hex(length, data);
+      fprintf(stderr, "\nExpected:");
+      tstring_print_hex(ciphertext);
+      fprintf(stderr, "\n");
+      FAIL();
+    }
+  free(data);
+}
+
 /* Test with more data and inplace decryption, to check that the
  * cbc_decrypt buffering works. */
 #define CBC_BULK_DATA 0x2710 /* 10000 */
@@ -161,6 +198,17 @@ test_main(void)
                       "b2eb05e2c39be9fcda6c19078c6a9d1b"),
                  SHEX("000102030405060708090a0b0c0d0e0f"));
 
+  test_cbc_aes128(SHEX("2b7e151628aed2a6abf7158809cf4f3c"),
+                 SHEX("6bc1bee22e409f96e93d7e117393172a"
+                      "ae2d8a571e03ac9c9eb76fac45af8e51"
+                      "30c81c46a35ce411e5fbc1191a0a52ef"
+                      "f69f2445df4f9b17ad2b417be66c3710"),
+                 SHEX("7649abac8119b246cee98e9b12e9197d"
+                      "5086cb9b507219ee95db113a917678b2"
+                      "73bed6b8e3c1743b7116e69e22229516"
+                      "3ff1caa1681fac09120eca307586e1a7"),
+                 SHEX("000102030405060708090a0b0c0d0e0f"));
+
   test_cbc_bulk();
 }
 
diff --git a/x86_64/aesni/aes128-decrypt.asm b/x86_64/aesni/aes128-decrypt.asm
new file mode 100644
index 00000000..79111e47
--- /dev/null
+++ b/x86_64/aesni/aes128-decrypt.asm
@@ -0,0 +1,136 @@
+C x86_64/aesni/aes128-decrypt.asm
+
+ifelse(`
+   Copyright (C) 2015, 2018, 2021 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Input argument
+define(`CTX',  `%rdi')
+define(`LENGTH',`%rsi')
+define(`DST',  `%rdx')
+define(`SRC',  `%rcx')
+
+define(`KEY0', `%xmm0')
+define(`KEY1', `%xmm1')
+define(`KEY2', `%xmm2')
+define(`KEY3', `%xmm3')
+define(`KEY4', `%xmm4')
+define(`KEY5', `%xmm5')
+define(`KEY6', `%xmm6')
+define(`KEY7', `%xmm7')
+define(`KEY8', `%xmm8')
+define(`KEY9', `%xmm9')
+define(`KEY10', `%xmm10')
+define(`X', `%xmm11')
+define(`Y', `%xmm12')
+
+       .file "aes128-decrypt.asm"
+
+       C nettle_aes128_decrypt(const struct aes128_ctx *ctx,
+       C                       size_t length, uint8_t *dst,
+       C                       const uint8_t *src);
+
+       .text
+       ALIGN(16)
+PROLOGUE(nettle_aes128_decrypt)
+       W64_ENTRY(4, 13)
+       shr     $4, LENGTH      C Byte count to number of 16-byte blocks
+       test    LENGTH, LENGTH
+       jz      .Lend           C No complete block, nothing to do
+
+       movups  (CTX), KEY0     C Load all 11 subkeys into registers once
+       movups  16(CTX), KEY1
+       movups  32(CTX), KEY2
+       movups  48(CTX), KEY3
+       movups  64(CTX), KEY4
+       movups  80(CTX), KEY5
+       movups  96(CTX), KEY6
+       movups  112(CTX), KEY7
+       movups  128(CTX), KEY8
+       movups  144(CTX), KEY9
+       movups  160(CTX), KEY10
+       shr     LENGTH          C Block count to pair count, carry is the odd block
+       jnc     .Lblock_loop    C Even count, go straight to the 2-way loop
+
+       movups  (SRC), X        C Odd count, process a single block first
+       pxor    KEY0, X
+       aesdec  KEY1, X
+       aesdec  KEY2, X
+       aesdec  KEY3, X
+       aesdec  KEY4, X
+       aesdec  KEY5, X
+       aesdec  KEY6, X
+       aesdec  KEY7, X
+       aesdec  KEY8, X
+       aesdec  KEY9, X
+       aesdeclast KEY10, X
+
+       movups  X, (DST)
+       add     $16, SRC
+       add     $16, DST
+       test    LENGTH, LENGTH
+       jz      .Lend           C No block pairs remain
+
+.Lblock_loop:
+       movups  (SRC), X        C Two independent blocks per iteration, in X and Y
+       movups  16(SRC), Y
+       pxor    KEY0, X
+       pxor    KEY0, Y
+       aesdec  KEY1, X
+       aesdec  KEY1, Y
+       aesdec  KEY2, X
+       aesdec  KEY2, Y
+       aesdec  KEY3, X
+       aesdec  KEY3, Y
+       aesdec  KEY4, X
+       aesdec  KEY4, Y
+       aesdec  KEY5, X
+       aesdec  KEY5, Y
+       aesdec  KEY6, X
+       aesdec  KEY6, Y
+       aesdec  KEY7, X
+       aesdec  KEY7, Y
+       aesdec  KEY8, X
+       aesdec  KEY8, Y
+       aesdec  KEY9, X
+       aesdec  KEY9, Y
+       aesdeclast KEY10, X
+       aesdeclast KEY10, Y
+
+       movups  X, (DST)
+       movups  Y, 16(DST)
+       add     $32, SRC
+       add     $32, DST
+       dec     LENGTH          C LENGTH counts remaining pairs here
+       jnz     .Lblock_loop
+
+.Lend:
+       W64_EXIT(4, 13)
+       ret
+EPILOGUE(nettle_aes128_decrypt)
diff --git a/x86_64/aesni/aes128-encrypt.asm b/x86_64/aesni/aes128-encrypt.asm
new file mode 100644
index 00000000..8e7ebe78
--- /dev/null
+++ b/x86_64/aesni/aes128-encrypt.asm
@@ -0,0 +1,136 @@
+C x86_64/aesni/aes128-encrypt.asm
+
+ifelse(`
+   Copyright (C) 2015, 2018, 2021 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Input argument
+define(`CTX',  `%rdi')
+define(`LENGTH',`%rsi')
+define(`DST',  `%rdx')
+define(`SRC',  `%rcx')
+
+define(`KEY0', `%xmm0')
+define(`KEY1', `%xmm1')
+define(`KEY2', `%xmm2')
+define(`KEY3', `%xmm3')
+define(`KEY4', `%xmm4')
+define(`KEY5', `%xmm5')
+define(`KEY6', `%xmm6')
+define(`KEY7', `%xmm7')
+define(`KEY8', `%xmm8')
+define(`KEY9', `%xmm9')
+define(`KEY10', `%xmm10')
+define(`X', `%xmm11')
+define(`Y', `%xmm12')
+
+       .file "aes128-encrypt.asm"
+
+       C nettle_aes128_encrypt(const struct aes128_ctx *ctx,
+       C                       size_t length, uint8_t *dst,
+       C                       const uint8_t *src);
+
+       .text
+       ALIGN(16)
+PROLOGUE(nettle_aes128_encrypt)
+       W64_ENTRY(4, 13)
+       shr     $4, LENGTH      C Byte count to number of 16-byte blocks
+       test    LENGTH, LENGTH
+       jz      .Lend           C No complete block, nothing to do
+
+       movups  (CTX), KEY0     C Load all 11 subkeys into registers once
+       movups  16(CTX), KEY1
+       movups  32(CTX), KEY2
+       movups  48(CTX), KEY3
+       movups  64(CTX), KEY4
+       movups  80(CTX), KEY5
+       movups  96(CTX), KEY6
+       movups  112(CTX), KEY7
+       movups  128(CTX), KEY8
+       movups  144(CTX), KEY9
+       movups  160(CTX), KEY10
+       shr     LENGTH          C Block count to pair count, carry is the odd block
+       jnc     .Lblock_loop    C Even count, go straight to the 2-way loop
+
+       movups  (SRC), X        C Odd count, process a single block first
+       pxor    KEY0, X
+       aesenc  KEY1, X
+       aesenc  KEY2, X
+       aesenc  KEY3, X
+       aesenc  KEY4, X
+       aesenc  KEY5, X
+       aesenc  KEY6, X
+       aesenc  KEY7, X
+       aesenc  KEY8, X
+       aesenc  KEY9, X
+       aesenclast KEY10, X
+
+       movups  X, (DST)
+       add     $16, SRC
+       add     $16, DST
+       test    LENGTH, LENGTH
+       jz      .Lend           C No block pairs remain
+
+.Lblock_loop:
+       movups  (SRC), X        C Two independent blocks per iteration, in X and Y
+       movups  16(SRC), Y
+       pxor    KEY0, X
+       pxor    KEY0, Y
+       aesenc  KEY1, X
+       aesenc  KEY1, Y
+       aesenc  KEY2, X
+       aesenc  KEY2, Y
+       aesenc  KEY3, X
+       aesenc  KEY3, Y
+       aesenc  KEY4, X
+       aesenc  KEY4, Y
+       aesenc  KEY5, X
+       aesenc  KEY5, Y
+       aesenc  KEY6, X
+       aesenc  KEY6, Y
+       aesenc  KEY7, X
+       aesenc  KEY7, Y
+       aesenc  KEY8, X
+       aesenc  KEY8, Y
+       aesenc  KEY9, X
+       aesenc  KEY9, Y
+       aesenclast KEY10, X
+       aesenclast KEY10, Y
+
+       movups  X, (DST)
+       movups  Y, 16(DST)
+       add     $32, SRC
+       add     $32, DST
+       dec     LENGTH          C LENGTH counts remaining pairs here
+       jnz     .Lblock_loop
+
+.Lend:
+       W64_EXIT(4, 13)
+       ret
+EPILOGUE(nettle_aes128_encrypt)
diff --git a/x86_64/aesni/cbc-aes128-encrypt.asm b/x86_64/aesni/cbc-aes128-encrypt.asm
new file mode 100644
index 00000000..04c6c6b0
--- /dev/null
+++ b/x86_64/aesni/cbc-aes128-encrypt.asm
@@ -0,0 +1,108 @@
+C x86_64/aesni/cbc-aes128-encrypt.asm
+
+ifelse(`
+   Copyright (C) 2015, 2018, 2021 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Input argument
+define(`CTX',  `%rdi')
+define(`LENGTH',`%rsi')
+define(`DST',  `%rdx')
+define(`SRC',  `%rcx')
+
+define(`KEY0', `%xmm0')
+define(`KEY1', `%xmm1')
+define(`KEY2', `%xmm2')
+define(`KEY3', `%xmm3')
+define(`KEY4', `%xmm4')
+define(`KEY5', `%xmm5')
+define(`KEY6', `%xmm6')
+define(`KEY7', `%xmm7')
+define(`KEY8', `%xmm8')
+define(`KEY9', `%xmm9')
+define(`KEY10', `%xmm10')
+define(`X', `%xmm11')
+define(`BLOCK', `%xmm12')
+
+       .file "cbc-aes128-encrypt.asm"
+
+       C nettle_cbc_aes128_encrypt(struct cbc_aes128_ctx *ctx,
+       C                       size_t length, uint8_t *dst,
+       C                       const uint8_t *src);
+
+       .text
+       ALIGN(16)
+PROLOGUE(nettle_cbc_aes128_encrypt)
+       W64_ENTRY(4, 13)
+       shr     $4, LENGTH      C Byte count to number of 16-byte blocks
+       test    LENGTH, LENGTH
+       jz      .Lend           C No complete block, leave the stored IV untouched
+
+       movups  (CTX), KEY0     C Load all 11 subkeys into registers once
+       movups  16(CTX), KEY1
+       movups  32(CTX), KEY2
+       movups  48(CTX), KEY3
+       movups  64(CTX), KEY4
+       movups  80(CTX), KEY5
+       movups  96(CTX), KEY6
+       movups  112(CTX), KEY7
+       movups  128(CTX), KEY8
+       movups  144(CTX), KEY9
+       movups  160(CTX), KEY10
+       movups  176(CTX), X     C Load IV
+
+.Lblock_loop:
+       movups  (SRC), BLOCK    C Cleartext block
+       pxor    BLOCK, X        C X holds the IV or the previous ciphertext block
+       pxor    KEY0, X
+       aesenc  KEY1, X
+       aesenc  KEY2, X
+       aesenc  KEY3, X
+       aesenc  KEY4, X
+       aesenc  KEY5, X
+       aesenc  KEY6, X
+       aesenc  KEY7, X
+       aesenc  KEY8, X
+       aesenc  KEY9, X
+       aesenclast KEY10, X
+
+       movups  X, (DST)        C X stays live as the chaining value for the next block
+       add     $16, SRC
+       add     $16, DST
+
+       dec     LENGTH
+       jnz     .Lblock_loop
+
+       C Save IV
+       movups  X, 176(CTX)     C Last ciphertext block becomes the stored IV
+
+.Lend:
+       W64_EXIT(4, 13)
+       ret
+EPILOGUE(nettle_cbc_aes128_encrypt)

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.
_______________________________________________
nettle-bugs mailing list
[email protected]
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs

Reply via email to