Module Name:	src
Committed By:	riastradh
Date:		Mon Jun 29 23:51:35 UTC 2020
Modified Files:
	src/sys/arch/x86/conf: files.x86
	src/sys/arch/x86/x86: identcpu.c
	src/sys/crypto/aes/arch/x86: immintrin.h
Added Files:
	src/sys/crypto/aes/arch/x86: aes_ssse3.c aes_ssse3.h aes_ssse3_impl.c
	    aes_ssse3_impl.h aes_ssse3_subr.c files.aesssse3

Log Message:
New permutation-based AES implementation using SSSE3.

This covers a lot of CPUs -- particularly lower-end CPUs over the
past decade which lack AES-NI.

Derived from Mike Hamburg's public domain vpaes software; see
<https://crypto.stanford.edu/vpaes/> for details.


To generate a diff of this commit:
cvs rdiff -u -r1.115 -r1.116 src/sys/arch/x86/conf/files.x86
cvs rdiff -u -r1.110 -r1.111 src/sys/arch/x86/x86/identcpu.c
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/x86/aes_ssse3.c \
    src/sys/crypto/aes/arch/x86/aes_ssse3.h \
    src/sys/crypto/aes/arch/x86/aes_ssse3_impl.c \
    src/sys/crypto/aes/arch/x86/aes_ssse3_impl.h \
    src/sys/crypto/aes/arch/x86/aes_ssse3_subr.c \
    src/sys/crypto/aes/arch/x86/files.aesssse3
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/x86/immintrin.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
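The building block behind the "permutation-based" approach is PSHUFB
(_mm_shuffle_epi8), which acts as sixteen parallel 4-bit-to-8-bit table
lookups with no secret-dependent memory accesses.  A minimal userland
sketch of that primitive -- not part of this commit; it uses the standard
<tmmintrin.h> intrinsics rather than the kernel's local immintrin.h, and
assumes an SSSE3-capable CPU (compile with -mssse3):

	#include <stdint.h>
	#include <stdio.h>
	#include <tmmintrin.h>

	int
	main(void)
	{
		/* 16-entry table mapping each nybble i to i*i mod 16 */
		const __m128i table = _mm_setr_epi8(0, 1, 4, 9, 0, 9, 4, 1,
		    0, 1, 4, 9, 0, 9, 4, 1);
		uint8_t in[16], out[16];
		unsigned i;

		for (i = 0; i < 16; i++)
			in[i] = i;

		/*
		 * PSHUFB selects table[in[i] & 0xf] in all sixteen byte
		 * lanes at once -- sixteen 4-bit table lookups in one
		 * instruction, in constant time.
		 */
		_mm_storeu_si128((__m128i *)out,
		    _mm_shuffle_epi8(table, _mm_loadu_si128((const __m128i *)in)));

		for (i = 0; i < 16; i++)
			printf("%2u -> %2u\n", in[i], out[i]);
		return 0;
	}

vpaes decomposes each byte into its two nybbles and composes the AES
S-box and MixColumns out of such lookups; the sb*/dsb* constant tables
in aes_ssse3.c below encode those maps.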
Modified files:

Index: src/sys/arch/x86/conf/files.x86
diff -u src/sys/arch/x86/conf/files.x86:1.115 src/sys/arch/x86/conf/files.x86:1.116
--- src/sys/arch/x86/conf/files.x86:1.115	Mon Jun 29 23:47:54 2020
+++ src/sys/arch/x86/conf/files.x86	Mon Jun 29 23:51:35 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: files.x86,v 1.115 2020/06/29 23:47:54 riastradh Exp $
+#	$NetBSD: files.x86,v 1.116 2020/06/29 23:51:35 riastradh Exp $
 
 # options for MP configuration through the MP spec
 defflag opt_mpbios.h	MPBIOS MPDEBUG MPBIOS_SCANPCI
@@ -174,3 +174,6 @@ include "crypto/aes/arch/x86/files.aesvi
 
 # Bitsliced AES with SSE2
 include "crypto/aes/arch/x86/files.aessse2"
+
+# Permutation-based AES with PSHUFB
+include "crypto/aes/arch/x86/files.aesssse3"

Index: src/sys/arch/x86/x86/identcpu.c
diff -u src/sys/arch/x86/x86/identcpu.c:1.110 src/sys/arch/x86/x86/identcpu.c:1.111
--- src/sys/arch/x86/x86/identcpu.c:1.110	Mon Jun 29 23:47:54 2020
+++ src/sys/arch/x86/x86/identcpu.c	Mon Jun 29 23:51:35 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: identcpu.c,v 1.110 2020/06/29 23:47:54 riastradh Exp $	*/
+/*	$NetBSD: identcpu.c,v 1.111 2020/06/29 23:51:35 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.110 2020/06/29 23:47:54 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.111 2020/06/29 23:51:35 riastradh Exp $");
 
 #include "opt_xen.h"
 
@@ -41,6 +41,7 @@ __KERNEL_RCSID(0, "$NetBSD: identcpu.c,v
 
 #include <crypto/aes/arch/x86/aes_ni.h>
 #include <crypto/aes/arch/x86/aes_sse2.h>
+#include <crypto/aes/arch/x86/aes_ssse3.h>
 #include <crypto/aes/arch/x86/aes_via.h>
 
 #include <uvm/uvm_extern.h>
@@ -1006,6 +1007,10 @@ cpu_probe(struct cpu_info *ci)
 #endif
 		if (cpu_feature[4] & CPUID_VIA_HAS_ACE)
 			aes_md_init(&aes_via_impl);
+		else if (i386_has_sse && i386_has_sse2 &&
+		    (cpu_feature[1] & CPUID2_SSE3) &&
+		    (cpu_feature[1] & CPUID2_SSSE3))
+			aes_md_init(&aes_ssse3_impl);
 		else if (i386_has_sse && i386_has_sse2)
 			aes_md_init(&aes_sse2_impl);
 	} else {

Index: src/sys/crypto/aes/arch/x86/immintrin.h
diff -u src/sys/crypto/aes/arch/x86/immintrin.h:1.1 src/sys/crypto/aes/arch/x86/immintrin.h:1.2
--- src/sys/crypto/aes/arch/x86/immintrin.h:1.1	Mon Jun 29 23:47:54 2020
+++ src/sys/crypto/aes/arch/x86/immintrin.h	Mon Jun 29 23:51:35 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: immintrin.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $	*/
+/*	$NetBSD: immintrin.h,v 1.2 2020/06/29 23:51:35 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -53,6 +53,7 @@ typedef unsigned long long __v2du __attr
 typedef int __v4si __attribute__((__vector_size__(16)));
 typedef float __v4sf __attribute__((__vector_size__(16)));
 typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
 
 #elif defined(__clang__)
 
@@ -66,6 +67,7 @@ typedef unsigned long long __v2du __attr
 typedef int __v4si __attribute__((__vector_size__(16)));
 typedef float __v4sf __attribute__((__vector_size__(16)));
 typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
 
 #define	_INTRINSATTR							      \
 	__attribute__((__always_inline__, __nodebug__, __target__("sse2"),   \
@@ -79,6 +81,18 @@ typedef short __v8hi __attribute__((__ve
 
 #endif
 
+#define	_SSSE3_ATTR	__attribute__((target("ssse3")))
+
+#if defined(__GNUC__) && !defined(__clang__)
+#define	_mm_alignr_epi8(hi,lo,bytes)					      \
+	(__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi),	      \
+	    (__v2di)(__m128i)(lo), 8*(int)(bytes))
+#elif defined(__clang__)
+#define	_mm_alignr_epi8(hi,lo,bytes)					      \
+	(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi),	      \
+	    (__v16qi)(__m128i)(lo), (int)(bytes))
+#endif
+
 _INTRINSATTR
 static __inline __m128i
 _mm_loadu_si32(const void *__p)
@@ -97,6 +111,25 @@ _mm_loadu_si64(const void *__p)
 
 _INTRINSATTR
 static __inline __m128i
+_mm_load_si128(const __m128i *__p)
+{
+	return *__p;
+}
+
+_INTRINSATTR _SSSE3_ATTR
+static __inline __m128
+_mm_movehl_ps(__m128 __v0, __m128 __v1)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
+#elif defined(__clang__)
+	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1,
+	    6, 7, 2, 3);
+#endif
+}
+
+_INTRINSATTR
+static __inline __m128i
 _mm_set1_epi16(int16_t __v)
 {
 	return __extension__ (__m128i)(__v8hi){
@@ -133,12 +166,27 @@ _mm_set_epi64x(int64_t __v1, int64_t __v
 }
 
 _INTRINSATTR
+static __inline __m128
+_mm_setzero_ps(void)
+{
+	return __extension__ (__m128){ 0, 0, 0, 0 };
+}
+
+_INTRINSATTR
 static __inline __m128i
 _mm_setzero_si128(void)
 {
 	return _mm_set1_epi64x(0);
 }
 
+_INTRINSATTR _SSSE3_ATTR
+static __inline __m128i
+_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
+{
+	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
+	    (__v16qi)__vidx);
+}
+
 #define	_mm_shuffle_epi32(v,m)						      \
 	(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))
@@ -165,6 +213,13 @@ _mm_slli_epi64(__m128i __v, uint8_t __bi
 
 _INTRINSATTR
 static __inline __m128i
+_mm_srli_epi32(__m128i __v, uint8_t __bits)
+{
+	return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
+}
+
+_INTRINSATTR
+static __inline __m128i
 _mm_srli_epi64(__m128i __v, uint8_t __bits)
 {
 	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
@@ -194,6 +249,13 @@ _mm_storeu_si64(void *__p, __m128i __v)
 }
 
 _INTRINSATTR
+static __inline void
+_mm_store_si128(__m128i *__p, __m128i __v)
+{
+	*__p = __v;
+}
+
+_INTRINSATTR
 static __inline __m128i
 _mm_sub_epi64(__m128i __x, __m128i __y)
 {

Added files:

Index: src/sys/crypto/aes/arch/x86/aes_ssse3.c
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_ssse3.c:1.1
--- /dev/null	Mon Jun 29 23:51:35 2020
+++ src/sys/crypto/aes/arch/x86/aes_ssse3.c	Mon Jun 29 23:51:35 2020
@@ -0,0 +1,556 @@
+/*	$NetBSD: aes_ssse3.c,v 1.1 2020/06/29 23:51:35 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Permutation-based AES using SSSE3, derived from Mike Hamburg's VPAES
+ * software, at <https://crypto.stanford.edu/vpaes/>, described in
+ *
+ *	Mike Hamburg, `Accelerating AES with Vector Permute
+ *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
+ *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
+ *	Springer LNCS 5747, pp. 18-32.
+ *
+ *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: aes_ssse3.c,v 1.1 2020/06/29 23:51:35 riastradh Exp $");
+
+#include <sys/types.h>
+
+#include <sys/systm.h>
+
+#include "aes_ssse3_impl.h"
+
+static const union m128const {
+	uint64_t u64[2];
+	__m128i m;
+}
+mc_forward[4] = {
+	{.u64 = {0x0407060500030201, 0x0C0F0E0D080B0A09}},
+	{.u64 = {0x080B0A0904070605, 0x000302010C0F0E0D}},
+	{.u64 = {0x0C0F0E0D080B0A09, 0x0407060500030201}},
+	{.u64 = {0x000302010C0F0E0D, 0x080B0A0904070605}},
+},
+mc_backward[4] = {
+	{.u64 = {0x0605040702010003, 0x0E0D0C0F0A09080B}},
+	{.u64 = {0x020100030E0D0C0F, 0x0A09080B06050407}},
+	{.u64 = {0x0E0D0C0F0A09080B, 0x0605040702010003}},
+	{.u64 = {0x0A09080B06050407, 0x020100030E0D0C0F}},
+},
+ipt[2] = {
+	{.u64 = {0xC2B2E8985A2A7000, 0xCABAE09052227808}},
+	{.u64 = {0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81}},
+},
+opt[2] = {
+	{.u64 = {0xFF9F4929D6B66000, 0xF7974121DEBE6808}},
+	{.u64 = {0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0}},
+},
+dipt[2] = {
+	{.u64 = {0x0F505B040B545F00, 0x154A411E114E451A}},
+	{.u64 = {0x86E383E660056500, 0x12771772F491F194}},
+},
+sb1[2] = {
+	{.u64 = {0xB19BE18FCB503E00, 0xA5DF7A6E142AF544}},
+	{.u64 = {0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF}},
+},
+sb2[2] = {
+	{.u64 = {0xE27A93C60B712400, 0x5EB7E955BC982FCD}},
+	{.u64 = {0x69EB88400AE12900, 0xC2A163C8AB82234A}},
+},
+sbo[2] = {
+	{.u64 = {0xD0D26D176FBDC700, 0x15AABF7AC502A878}},
+	{.u64 = {0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA}},
+},
+dsb9[2] = {
+	{.u64 = {0x851C03539A86D600, 0xCAD51F504F994CC9}},
+	{.u64 = {0xC03B1789ECD74900, 0x725E2C9EB2FBA565}},
+},
+dsbd[2] = {
+	{.u64 = {0x7D57CCDFE6B1A200, 0xF56E9B13882A4439}},
+	{.u64 = {0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3}},
+},
+dsbb[2] = {
+	{.u64 = {0xD022649296B44200, 0x602646F6B0F2D404}},
+	{.u64 = {0xC19498A6CD596700, 0xF3FF0C3E3255AA6B}},
+},
+dsbe[2] = {
+	{.u64 = {0x46F2929626D4D000, 0x2242600464B4F6B0}},
+	{.u64 = {0x0C55A6CDFFAAC100, 0x9467F36B98593E32}},
+},
+dsbo[2] = {
+	{.u64 = {0x1387EA537EF94000, 0xC7AA6DB9D4943E2D}},
+	{.u64 = {0x12D7560F93441D00, 0xCA4B8159D8C58E9C}},
+},
+dks1[2] = {
+	{.u64 = {0xB6116FC87ED9A700, 0x4AED933482255BFC}},
+	{.u64 = {0x4576516227143300, 0x8BB89FACE9DAFDCE}},
+},
+dks2[2] = {
+	{.u64 = {0x27438FEBCCA86400, 0x4622EE8AADC90561}},
+	{.u64 = {0x815C13CE4F92DD00, 0x73AEE13CBD602FF2}},
+},
+dks3[2] = {
+	{.u64 = {0x03C4C50201C6C700, 0xF83F3EF9FA3D3CFB}},
+	{.u64 = {0xEE1921D638CFF700, 0xA5526A9D7384BC4B}},
+},
+dks4[2] = {
+	{.u64 = {0xE3C390B053732000, 0xA080D3F310306343}},
+	{.u64 = {0xA0CA214B036982E8, 0x2F45AEC48CE60D67}},
+},
+deskew[2] = {
+	{.u64 = {0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A}},
+	{.u64 = {0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77}},
+},
+sr[4] = {
+	{.u64 = {0x0706050403020100, 0x0F0E0D0C0B0A0908}},
+	{.u64 = {0x030E09040F0A0500, 0x0B06010C07020D08}},
+	{.u64 = {0x0F060D040B020900, 0x070E050C030A0108}},
+	{.u64 = {0x0B0E0104070A0D00, 0x0306090C0F020508}},
+},
+rcon = {.u64 = {0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81}},
+s63 = {.u64 = {0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B}},
+of = {.u64 = {0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F}},
+inv = {.u64 = {0x0E05060F0D080180, 0x040703090A0B0C02}},
+inva = {.u64 = {0x01040A060F0B0780, 0x030D0E0C02050809}};
+
+static inline __m128i
+loadroundkey(const uint32_t *rk32)
+{
+	return _mm_load_si128((const void *)rk32);
+}
+
+static inline void
+storeroundkey(uint32_t *rk32, __m128i rk)
+{
+	_mm_store_si128((void *)rk32, rk);
+}
+
+/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */
+static inline void
+bytes2nybbles(__m128i *restrict lo, __m128i *restrict hi, __m128i x)
+{
+
+	*lo = x & of.m;
+	*hi = _mm_srli_epi32(x & ~of.m, 4);
+}
+
+/* Given 0p0q0r0s, return 0x0y0z0w where x = a/p, y = a/q, &c. */
+static inline __m128i
+gf16_inva(__m128i x)
+{
+	return _mm_shuffle_epi8(inva.m, x);
+}
+
+/* Given 0p0q0r0s, return 0x0y0z0w where x = 1/p, y = 1/q, &c. */
+static inline __m128i
+gf16_inv(__m128i x)
+{
+	return _mm_shuffle_epi8(inv.m, x);
+}
+
+/*
+ * t is a pair of maps respectively from low and high nybbles to bytes.
+ * Apply t to the nybbles, and add the results in GF(2).
+ */
+static __m128i
+aes_schedule_transform(__m128i x, const union m128const t[static 2])
+{
+	__m128i lo, hi;
+
+	bytes2nybbles(&lo, &hi, x);
+	return _mm_shuffle_epi8(t[0].m, lo) ^ _mm_shuffle_epi8(t[1].m, hi);
+}
+
+static inline void
+subbytes(__m128i *io, __m128i *jo, __m128i x)
+{
+	__m128i k, i, ak, j;
+
+	bytes2nybbles(&k, &i, x);
+	ak = gf16_inva(k);
+	j = i ^ k;
+	*io = j ^ gf16_inv(ak ^ gf16_inv(i));
+	*jo = i ^ gf16_inv(ak ^ gf16_inv(j));
+}
+
+static __m128i
+aes_schedule_low_round(__m128i rk, __m128i prk)
+{
+	__m128i io, jo;
+
+	/* smear prk */
+	prk ^= _mm_slli_si128(prk, 4);
+	prk ^= _mm_slli_si128(prk, 8);
+	prk ^= s63.m;
+
+	/* subbytes */
+	subbytes(&io, &jo, rk);
+	rk = _mm_shuffle_epi8(sb1[0].m, io) ^ _mm_shuffle_epi8(sb1[1].m, jo);
+
+	/* add in smeared stuff */
+	return rk ^ prk;
+}
+
+static __m128i
+aes_schedule_round(__m128i rk, __m128i prk, __m128i *rcon_rot)
+{
+
+	/* extract rcon from rcon_rot */
+	prk ^= _mm_alignr_epi8(_mm_setzero_si128(), *rcon_rot, 15);
+	*rcon_rot = _mm_alignr_epi8(*rcon_rot, *rcon_rot, 15);
+
+	/* rotate */
+	rk = _mm_shuffle_epi32(rk, 0xff);
+	rk = _mm_alignr_epi8(rk, rk, 1);
+
+	return aes_schedule_low_round(rk, prk);
+}
+
+static __m128i
+aes_schedule_mangle_enc(__m128i x, __m128i sr_i)
+{
+	__m128i y = _mm_setzero_si128();
+
+	x ^= s63.m;
+
+	x = _mm_shuffle_epi8(x, mc_forward[0].m);
+	y ^= x;
+	x = _mm_shuffle_epi8(x, mc_forward[0].m);
+	y ^= x;
+	x = _mm_shuffle_epi8(x, mc_forward[0].m);
+	y ^= x;
+
+	return _mm_shuffle_epi8(y, sr_i);
+}
+
+static __m128i
+aes_schedule_mangle_last_enc(__m128i x, __m128i sr_i)
+{
+
+	return aes_schedule_transform(_mm_shuffle_epi8(x, sr_i) ^ s63.m, opt);
+}
+
+static __m128i
+aes_schedule_mangle_dec(__m128i x, __m128i sr_i)
+{
+	__m128i y = _mm_setzero_si128();
+
+	x = aes_schedule_transform(x, dks1);
+	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
+	x = aes_schedule_transform(x, dks2);
+	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
+	x = aes_schedule_transform(x, dks3);
+	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
+	x = aes_schedule_transform(x, dks4);
+	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
+
+	return _mm_shuffle_epi8(y, sr_i);
+}
+
+static __m128i
+aes_schedule_mangle_last_dec(__m128i x)
+{
+
+	return aes_schedule_transform(x ^ s63.m, deskew);
+}
+
+static __m128i
+aes_schedule_192_smear(__m128i prkhi, __m128i prk)
+{
+	__m128i rk;
+
+	rk = prkhi;
+	rk ^= _mm_shuffle_epi32(prkhi, 0x80);
+	rk ^= _mm_shuffle_epi32(prk, 0xfe);
+
+	return rk;
+}
+
+static __m128i
+aes_schedule_192_smearhi(__m128i rk)
+{
+	return (__m128i)_mm_movehl_ps((__m128)rk, _mm_setzero_ps());
+}
+
+void
+aes_ssse3_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
+{
+	uint32_t *rk32 = enc->aese_aes.aes_rk;
+	__m128i mrk;		/* mangled round key */
+	__m128i rk;		/* round key */
+	__m128i prk;		/* previous round key */
+	__m128i rcon_rot = rcon.m;
+	uint64_t i = 3;
+
+	/* input transform */
+	rk = aes_schedule_transform(_mm_loadu_epi8(key), ipt);
+	storeroundkey(rk32, rk);
+	rk32 += 4;
+
+	switch (nrounds) {
+	case 10:
+		for (;;) {
+			rk = aes_schedule_round(rk, rk, &rcon_rot);
+			if (--nrounds == 0)
+				break;
+			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 += 4;
+		}
+		break;
+	case 12: {
+		__m128i prkhi;	/* high half of previous round key */
+
+		prk = rk;
+		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
+		prkhi = aes_schedule_192_smearhi(rk);
+		for (;;) {
+			prk = aes_schedule_round(rk, prk, &rcon_rot);
+			rk = _mm_alignr_epi8(prk, prkhi, 8);
+
+			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 += 4;
+			rk = aes_schedule_192_smear(prkhi, prk);
+			prkhi = aes_schedule_192_smearhi(rk);
+
+			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 += 4;
+			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
+			if ((nrounds -= 3) == 0)
+				break;
+
+			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 += 4;
+			rk = aes_schedule_192_smear(prkhi, prk);
+			prkhi = aes_schedule_192_smearhi(rk);
+		}
+		break;
+	}
+	case 14: {
+		__m128i pprk;	/* previous previous round key */
+
+		prk = rk;
+		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
+		for (;;) {
+			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 += 4;
+			pprk = rk;
+
+			/* high round */
+			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
+			if ((nrounds -= 2) == 0)
+				break;
+			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 += 4;
+
+			/* low round */
+			rk = _mm_shuffle_epi32(rk, 0xff);
+			rk = aes_schedule_low_round(rk, pprk);
+		}
+		break;
+	}
+	default:
+		panic("invalid number of AES rounds: %u", nrounds);
+	}
+	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4].m));
+}
+
+void
+aes_ssse3_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
+{
+	uint32_t *rk32 = dec->aesd_aes.aes_rk;
+	__m128i mrk;		/* mangled round key */
+	__m128i ork;		/* original round key */
+	__m128i rk;		/* round key */
+	__m128i prk;		/* previous round key */
+	__m128i rcon_rot = rcon.m;
+	unsigned i = nrounds == 12 ? 0 : 2;
+
+	ork = _mm_loadu_epi8(key);
+
+	/* input transform */
+	rk = aes_schedule_transform(ork, ipt);
+
+	/* go from end */
+	rk32 += 4*nrounds;
+	storeroundkey(rk32, _mm_shuffle_epi8(ork, sr[i].m));
+	rk32 -= 4;
+	i ^= 3;
+
+	switch (nrounds) {
+	case 10:
+		for (;;) {
+			rk = aes_schedule_round(rk, rk, &rcon_rot);
+			if (--nrounds == 0)
+				break;
+			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 -= 4;
+		}
+		break;
+	case 12: {
+		__m128i prkhi;	/* high half of previous round key */
+
+		prk = rk;
+		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
+		prkhi = aes_schedule_192_smearhi(rk);
+		for (;;) {
+			prk = aes_schedule_round(rk, prk, &rcon_rot);
+			rk = _mm_alignr_epi8(prk, prkhi, 8);
+
+			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 -= 4;
+			rk = aes_schedule_192_smear(prkhi, prk);
+			prkhi = aes_schedule_192_smearhi(rk);
+
+			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 -= 4;
+			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
+			if ((nrounds -= 3) == 0)
+				break;
+
+			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 -= 4;
+			rk = aes_schedule_192_smear(prkhi, prk);
+			prkhi = aes_schedule_192_smearhi(rk);
+		}
+		break;
+	}
+	case 14: {
+		__m128i pprk;	/* previous previous round key */
+
+		prk = rk;
+		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
+		for (;;) {
+			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 -= 4;
+			pprk = rk;
+
+			/* high round */
+			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
+			if ((nrounds -= 2) == 0)
+				break;
+			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
+			storeroundkey(rk32, mrk);
+			rk32 -= 4;
+
+			/* low round */
+			rk = _mm_shuffle_epi32(rk, 0xff);
+			rk = aes_schedule_low_round(rk, pprk);
+		}
+		break;
+	}
+	default:
+		panic("invalid number of AES rounds: %u", nrounds);
+	}
+	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
+}
+
+__m128i
+aes_ssse3_enc1(const struct aesenc *enc, __m128i x, unsigned nrounds)
+{
+	const uint32_t *rk32 = enc->aese_aes.aes_rk;
+	__m128i io, jo;
+	unsigned rmod4 = 0;
+
+	x = aes_schedule_transform(x, ipt);
+	x ^= loadroundkey(rk32);
+	for (;;) {
+		__m128i A, A2, A2_B, A2_B_D;
+
+		subbytes(&io, &jo, x);
+
+		rk32 += 4;
+		rmod4 = (rmod4 + 1) % 4;
+		if (--nrounds == 0)
+			break;
+
+		A = _mm_shuffle_epi8(sb1[0].m, io) ^
+		    _mm_shuffle_epi8(sb1[1].m, jo);
+		A ^= loadroundkey(rk32);
+		A2 = _mm_shuffle_epi8(sb2[0].m, io) ^
+		    _mm_shuffle_epi8(sb2[1].m, jo);
+		A2_B = A2 ^ _mm_shuffle_epi8(A, mc_forward[rmod4].m);
+		A2_B_D = A2_B ^ _mm_shuffle_epi8(A, mc_backward[rmod4].m);
+		x = A2_B_D ^ _mm_shuffle_epi8(A2_B, mc_forward[rmod4].m);
+	}
+	x = _mm_shuffle_epi8(sbo[0].m, io) ^ _mm_shuffle_epi8(sbo[1].m, jo);
+	x ^= loadroundkey(rk32);
+	return _mm_shuffle_epi8(x, sr[rmod4].m);
+}
+
+__m128i
+aes_ssse3_dec1(const struct aesdec *dec, __m128i x, unsigned nrounds)
+{
+	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
+	unsigned i = 3 & ~(nrounds - 1);
+	__m128i io, jo, mc;
+
+	x = aes_schedule_transform(x, dipt);
+	x ^= loadroundkey(rk32);
+	rk32 += 4;
+
+	mc = mc_forward[3].m;
+	for (;;) {
+		subbytes(&io, &jo, x);
+		if (--nrounds == 0)
+			break;
+
+		x = _mm_shuffle_epi8(dsb9[0].m, io) ^
+		    _mm_shuffle_epi8(dsb9[1].m, jo);
+		x ^= loadroundkey(rk32);
+		rk32 += 4;	/* next round key */
+
+		x = _mm_shuffle_epi8(x, mc);
+		x ^= _mm_shuffle_epi8(dsbd[0].m, io) ^
+		    _mm_shuffle_epi8(dsbd[1].m, jo);
+
+		x = _mm_shuffle_epi8(x, mc);
+		x ^= _mm_shuffle_epi8(dsbb[0].m, io) ^
+		    _mm_shuffle_epi8(dsbb[1].m, jo);
+
+		x = _mm_shuffle_epi8(x, mc);
+		x ^= _mm_shuffle_epi8(dsbe[0].m, io) ^
+		    _mm_shuffle_epi8(dsbe[1].m, jo);
+
+		mc = _mm_alignr_epi8(mc, mc, 12);
+	}
+	x = _mm_shuffle_epi8(dsbo[0].m, io) ^ _mm_shuffle_epi8(dsbo[1].m, jo);
+	x ^= loadroundkey(rk32);
+	return _mm_shuffle_epi8(x, sr[i].m);
+}

Index: src/sys/crypto/aes/arch/x86/aes_ssse3.h
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_ssse3.h:1.1
--- /dev/null	Mon Jun 29 23:51:35 2020
+++ src/sys/crypto/aes/arch/x86/aes_ssse3.h	Mon Jun 29 23:51:35 2020
@@ -0,0 +1,62 @@
+/*	$NetBSD: aes_ssse3.h,v 1.1 2020/06/29 23:51:35 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_CRYPTO_AES_ARCH_X86_AES_SSSE3_H
+#define	_CRYPTO_AES_ARCH_X86_AES_SSSE3_H
+
+#include <crypto/aes/aes.h>
+
+/*
+ * These functions MUST NOT use any vector registers for parameters or
+ * results -- the caller is compiled with -mno-sse &c. in the kernel,
+ * and dynamically turns on the vector unit just before calling them.
+ * Internal subroutines that use the vector unit for parameters are
+ * declared in aes_ssse3_impl.h instead.
+ */
+
+void aes_ssse3_setenckey(struct aesenc *, const uint8_t *, unsigned);
+void aes_ssse3_setdeckey(struct aesdec *, const uint8_t *, unsigned);
+
+void aes_ssse3_enc(const struct aesenc *, const uint8_t[static 16],
+    uint8_t[static 16], uint32_t);
+void aes_ssse3_dec(const struct aesdec *, const uint8_t[static 16],
+    uint8_t[static 16], uint32_t);
+void aes_ssse3_cbc_enc(const struct aesenc *, const uint8_t[static 16],
+    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void aes_ssse3_cbc_dec(const struct aesdec *, const uint8_t[static 16],
+    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void aes_ssse3_xts_enc(const struct aesenc *, const uint8_t[static 16],
+    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void aes_ssse3_xts_dec(const struct aesdec *, const uint8_t[static 16],
+    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+
+int aes_ssse3_selftest(void);
+
+extern struct aes_impl aes_ssse3_impl;
+
+#endif	/* _CRYPTO_AES_ARCH_X86_AES_SSSE3_H */

Index: src/sys/crypto/aes/arch/x86/aes_ssse3_impl.c
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_ssse3_impl.c:1.1
--- /dev/null	Mon Jun 29 23:51:35 2020
+++ src/sys/crypto/aes/arch/x86/aes_ssse3_impl.c	Mon Jun 29 23:51:35 2020
@@ -0,0 +1,165 @@
+/*	$NetBSD: aes_ssse3_impl.c,v 1.1 2020/06/29 23:51:35 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: aes_ssse3_impl.c,v 1.1 2020/06/29 23:51:35 riastradh Exp $");
+
+#include <crypto/aes/aes.h>
+#include <crypto/aes/arch/x86/aes_ssse3.h>
+
+#include <x86/cpu.h>
+#include <x86/cpuvar.h>
+#include <x86/fpu.h>
+#include <x86/specialreg.h>
+
+static void
+aes_ssse3_setenckey_impl(struct aesenc *enc, const uint8_t *key,
+    uint32_t nrounds)
+{
+
+	fpu_kern_enter();
+	aes_ssse3_setenckey(enc, key, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aes_ssse3_setdeckey_impl(struct aesdec *dec, const uint8_t *key,
+    uint32_t nrounds)
+{
+
+	fpu_kern_enter();
+	aes_ssse3_setdeckey(dec, key, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aes_ssse3_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], uint32_t nrounds)
+{
+
+	fpu_kern_enter();
+	aes_ssse3_enc(enc, in, out, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aes_ssse3_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], uint32_t nrounds)
+{
+
+	fpu_kern_enter();
+	aes_ssse3_dec(dec, in, out, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aes_ssse3_cbc_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+    uint32_t nrounds)
+{
+
+	if (nbytes == 0)
+		return;
+	fpu_kern_enter();
+	aes_ssse3_cbc_enc(enc, in, out, nbytes, iv, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aes_ssse3_cbc_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+    uint32_t nrounds)
+{
+
+	if (nbytes == 0)
+		return;
+	fpu_kern_enter();
+	aes_ssse3_cbc_dec(dec, in, out, nbytes, iv, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aes_ssse3_xts_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+    uint32_t nrounds)
+{
+
+	if (nbytes == 0)
+		return;
+	fpu_kern_enter();
+	aes_ssse3_xts_enc(enc, in, out, nbytes, iv, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aes_ssse3_xts_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+    uint32_t nrounds)
+{
+
+	if (nbytes == 0)
+		return;
+	fpu_kern_enter();
+	aes_ssse3_xts_dec(dec, in, out, nbytes, iv, nrounds);
+	fpu_kern_leave();
+}
+
+static int
+aes_ssse3_probe(void)
+{
+	int result = 0;
+
+	/* Verify that the CPU supports SSE, SSE2, SSE3, and SSSE3. */
+	if (!i386_has_sse)
+		return -1;
+	if (!i386_has_sse2)
+		return -1;
+	if ((cpu_feature[1] & CPUID2_SSE3) == 0)
+		return -1;
+	if ((cpu_feature[1] & CPUID2_SSSE3) == 0)
+		return -1;
+
+	fpu_kern_enter();
+	result = aes_ssse3_selftest();
+	fpu_kern_leave();
+
+	return result;
+}
+
+struct aes_impl aes_ssse3_impl = {
+	.ai_name = "Intel SSSE3 vpaes",
+	.ai_probe = aes_ssse3_probe,
+	.ai_setenckey = aes_ssse3_setenckey_impl,
+	.ai_setdeckey = aes_ssse3_setdeckey_impl,
+	.ai_enc = aes_ssse3_enc_impl,
+	.ai_dec = aes_ssse3_dec_impl,
+	.ai_cbc_enc = aes_ssse3_cbc_enc_impl,
+	.ai_cbc_dec = aes_ssse3_cbc_dec_impl,
+	.ai_xts_enc = aes_ssse3_xts_enc_impl,
+	.ai_xts_dec = aes_ssse3_xts_dec_impl,
+};

Index: src/sys/crypto/aes/arch/x86/aes_ssse3_impl.h
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_ssse3_impl.h:1.1
--- /dev/null	Mon Jun 29 23:51:35 2020
+++ src/sys/crypto/aes/arch/x86/aes_ssse3_impl.h	Mon Jun 29 23:51:35 2020
@@ -0,0 +1,42 @@
+/*	$NetBSD: aes_ssse3_impl.h,v 1.1 2020/06/29 23:51:35 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_CRYPTO_AES_ARCH_X86_AES_SSSE3_IMPL_H
+#define	_CRYPTO_AES_ARCH_X86_AES_SSSE3_IMPL_H
+
+#include <sys/types.h>
+
+#include <crypto/aes/aes.h>
+#include <crypto/aes/arch/x86/aes_ssse3.h>
+#include <crypto/aes/arch/x86/immintrin.h>
+#include <crypto/aes/arch/x86/immintrin_ext.h>
+
+__m128i aes_ssse3_enc1(const struct aesenc *, __m128i, unsigned);
+__m128i aes_ssse3_dec1(const struct aesdec *, __m128i, unsigned);
+
+#endif	/* _CRYPTO_AES_ARCH_X86_AES_SSSE3_IMPL_H */

Index: src/sys/crypto/aes/arch/x86/aes_ssse3_subr.c
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_ssse3_subr.c:1.1
--- /dev/null	Mon Jun 29 23:51:35 2020
+++ src/sys/crypto/aes/arch/x86/aes_ssse3_subr.c	Mon Jun 29 23:51:35 2020
@@ -0,0 +1,213 @@
+/*	$NetBSD: aes_ssse3_subr.c,v 1.1 2020/06/29 23:51:35 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: aes_ssse3_subr.c,v 1.1 2020/06/29 23:51:35 riastradh Exp $");
+
+#include <sys/systm.h>
+
+#include <lib/libkern/libkern.h>
+
+#include "aes_ssse3_impl.h"
+
+static inline __m128i
+loadblock(const void *in)
+{
+	return _mm_loadu_epi8(in);
+}
+
+static inline void
+storeblock(void *out, __m128i block)
+{
+	_mm_storeu_epi8(out, block);
+}
+
+void
+aes_ssse3_enc(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], uint32_t nrounds)
+{
+	__m128i block;
+
+	block = loadblock(in);
+	block = aes_ssse3_enc1(enc, block, nrounds);
+	storeblock(out, block);
+}
+
+void
+aes_ssse3_dec(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], uint32_t nrounds)
+{
+	__m128i block;
+
+	block = loadblock(in);
+	block = aes_ssse3_dec1(dec, block, nrounds);
+	storeblock(out, block);
+}
+
+void
+aes_ssse3_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+    uint32_t nrounds)
+{
+	__m128i cv;
+
+	KASSERT(nbytes);
+
+	cv = loadblock(iv);
+	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+		cv ^= loadblock(in);
+		cv = aes_ssse3_enc1(enc, cv, nrounds);
+		storeblock(out, cv);
+	}
+	storeblock(iv, cv);
+}
+
+void
+aes_ssse3_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+    uint32_t nrounds)
+{
+	__m128i iv0, cv, b;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	iv0 = loadblock(iv);
+	cv = loadblock(in + nbytes - 16);
+	storeblock(iv, cv);
+
+	for (;;) {
+		b = aes_ssse3_dec1(dec, cv, nrounds);
+		if ((nbytes -= 16) == 0)
+			break;
+		cv = loadblock(in + nbytes - 16);
+		storeblock(out + nbytes, b ^ cv);
+	}
+	storeblock(out, b ^ iv0);
+}
+
+static inline __m128i
+aes_ssse3_xts_update(__m128i t)
+{
+	const __m128i one = _mm_set_epi64x(1, 1);
+	__m128i s, m, c;
+
+	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
+	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
+	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
+	c = _mm_set_epi64x(1, 0x87);	/* carry */
+
+	return _mm_slli_epi64(t, 1) ^ (c & ~m);
+}
+
+static int
+aes_ssse3_xts_update_selftest(void)
+{
+	static const struct {
+		uint32_t in[4], out[4];
+	} cases[] = {
+		[0] = { {1}, {2} },
+		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
+		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
+		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
+		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
+		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
+	};
+	unsigned i;
+	uint32_t t[4];
+	int result = 0;
+
+	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
+		t[0] = cases[i].in[0];
+		t[1] = cases[i].in[1];
+		t[2] = cases[i].in[2];
+		t[3] = cases[i].in[3];
+		storeblock(t, aes_ssse3_xts_update(loadblock(t)));
+		if (t[0] != cases[i].out[0] ||
+		    t[1] != cases[i].out[1] ||
+		    t[2] != cases[i].out[2] ||
+		    t[3] != cases[i].out[3]) {
+			printf("%s %u:"
+			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
+			    __func__, i, t[0], t[1], t[2], t[3]);
+			result = -1;
+		}
+	}
+
+	return result;
+}
+
+void
+aes_ssse3_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+    uint32_t nrounds)
+{
+	__m128i t, b;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	t = loadblock(tweak);
+	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+		b = t ^ loadblock(in);
+		b = aes_ssse3_enc1(enc, b, nrounds);
+		storeblock(out, t ^ b);
+		t = aes_ssse3_xts_update(t);
+	}
+	storeblock(tweak, t);
+}
+
+void
+aes_ssse3_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+    uint32_t nrounds)
+{
+	__m128i t, b;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	t = loadblock(tweak);
+	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+		b = t ^ loadblock(in);
+		b = aes_ssse3_dec1(dec, b, nrounds);
+		storeblock(out, t ^ b);
+		t = aes_ssse3_xts_update(t);
+	}
+	storeblock(tweak, t);
+}
+
+int
+aes_ssse3_selftest(void)
+{
+
+	if (aes_ssse3_xts_update_selftest())
+		return -1;
+
+	return 0;
+}

Index: src/sys/crypto/aes/arch/x86/files.aesssse3
diff -u /dev/null src/sys/crypto/aes/arch/x86/files.aesssse3:1.1
--- /dev/null	Mon Jun 29 23:51:35 2020
+++ src/sys/crypto/aes/arch/x86/files.aesssse3	Mon Jun 29 23:51:35 2020
@@ -0,0 +1,8 @@
+#	$NetBSD: files.aesssse3,v 1.1 2020/06/29 23:51:35 riastradh Exp $
+
+makeoptions	aes	"COPTS.aes_ssse3.c"+="-msse -msse2 -msse3 -mssse3"
+makeoptions	aes	"COPTS.aes_ssse3_subr.c"+="-msse -msse2 -msse3 -mssse3"
+
+file	crypto/aes/arch/x86/aes_ssse3.c		aes
+file	crypto/aes/arch/x86/aes_ssse3_subr.c	aes
+file	crypto/aes/arch/x86/aes_ssse3_impl.c	aes
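For reference, aes_ssse3_xts_update above is a branch-free SSE2 rendition
of multiplying the 128-bit XTS tweak by x in GF(2^128) modulo
x^128 + x^7 + x^2 + x + 1.  A scalar sketch of the same update, matching
the little-endian word order of the selftest vectors (xts_mulx is a
hypothetical name for illustration, not part of the commit):

	#include <stdint.h>

	/*
	 * Shift the 128-bit tweak left by one bit; if a bit was
	 * carried out of the top, fold it back in as the reduction
	 * polynomial 0x87.  t[0] holds the low 64 bits.
	 */
	static void
	xts_mulx(uint64_t t[2])
	{
		uint64_t carry = t[1] >> 63;

		t[1] = (t[1] << 1) | (t[0] >> 63);
		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
	}

Checking against selftest case [4]: input words {0,0,0,0x80000000} put the
high bit in t[1], so the shift carries out and the result is {0x87,0,0,0},
as the table expects.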