* cipher/Makefile.am: Add 'rijndael-riscv-zvkned.c'.
* cipher/rijndael-internal.h (USE_RISCV_V_CRYPTO): New.
* cipher/rijndael-riscv-zvkned.c: New.
* cipher/rijndael.c [USE_RISCV_V_CRYPTO]
(_gcry_aes_riscv_zvkned_setup_acceleration, _gcry_aes_riscv_zvkned_setkey)
(_gcry_aes_riscv_zvkned_prepare_decryption)
(_gcry_aes_riscv_zvkned_encrypt, _gcry_aes_riscv_zvkned_decrypt)
(_gcry_aes_riscv_zvkned_cfb_enc, _gcry_aes_riscv_zvkned_cbc_enc)
(_gcry_aes_riscv_zvkned_ctr_enc, _gcry_aes_riscv_zvkned_ctr32le_enc)
(_gcry_aes_riscv_zvkned_cfb_dec, _gcry_aes_riscv_zvkned_cbc_dec)
(_gcry_aes_riscv_zvkned_ocb_crypt, _gcry_aes_riscv_zvkned_ocb_auth)
(_gcry_aes_riscv_zvkned_ecb_crypt, _gcry_aes_riscv_zvkned_xts_crypt): New.
(do_setkey) [USE_RISCV_V_CRYPTO]: Add setup for RISC-V vector
cryptography extension implementation.
* configure.ac: Add 'rijndael-riscv-zvkned.lo'.
(GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST): Add AES intrinsics.
(gcry_cv_riscv_vaes_vs_intrinsics_work, HAVE_BROKEN_VAES_VS_INTRINSIC): New.
* src/g10lib.h (HWF_RISCV_ZVKNED): Insert before HWF_RISCV_ZVKNHA.
* src/hwf-riscv.c (HWF_RISCV_HWPROBE_EXT_ZVKNED): New.
(hwprobe_features): Add Zvkned.
* src/hwfeatures.c (hwflist): Add "riscv-zvkned".
--
Implementation has been tested against QEMU emulator as there is no actual HW available with these instructions yet. Signed-off-by: Jussi Kivilinna <jussi.kivili...@iki.fi> --- cipher/Makefile.am | 7 + cipher/rijndael-internal.h | 9 + cipher/rijndael-riscv-zvkned.c | 1608 ++++++++++++++++++++++++++++++++ cipher/rijndael.c | 90 +- configure.ac | 76 ++ src/g10lib.h | 5 +- src/hwf-riscv.c | 2 + src/hwfeatures.c | 1 + 8 files changed, 1793 insertions(+), 5 deletions(-) create mode 100644 cipher/rijndael-riscv-zvkned.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index ea91b7b8..7abbd5b3 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -122,6 +122,7 @@ EXTRA_libcipher_la_SOURCES = \ rijndael-ppc.c rijndael-ppc9le.c \ rijndael-p10le.c rijndael-gcm-p10le.s \ rijndael-ppc-common.h rijndael-ppc-functions.h \ + rijndael-riscv-zvkned.c \ rijndael-s390x.c \ rijndael-vp-aarch64.c rijndael-vp-riscv.c \ rijndael-vp-simd128.h \ @@ -389,6 +390,12 @@ riscv_vector_crypto_cflags = endif endif +rijndael-riscv-zvkned.o: $(srcdir)/rijndael-riscv-zvkned.c Makefile + `echo $(COMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` + +rijndael-riscv-zvkned.lo: $(srcdir)/rijndael-riscv-zvkned.c Makefile + `echo $(LTCOMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` + sha256-riscv-zvknha-zvkb.o: $(srcdir)/sha256-riscv-zvknha-zvkb.c Makefile `echo $(COMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h index 92310fc5..15084a69 100644 --- a/cipher/rijndael-internal.h +++ b/cipher/rijndael-internal.h @@ -124,6 +124,15 @@ # endif #endif /* ENABLE_ARM_CRYPTO_SUPPORT */ +/* USE_RISCV_V_CRYPTO indicates whether to enable RISC-V vector cryptography + * extension code. */ +#undef USE_RISCV_V_CRYPTO +#if defined (__riscv) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS) +# define USE_RISCV_V_CRYPTO 1 +#endif + /* USE_VP_AARCH64 indicates whether to enable vector permute AArch64 SIMD code. */ #undef USE_VP_AARCH64 #if defined(__AARCH64EL__) && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) diff --git a/cipher/rijndael-riscv-zvkned.c b/cipher/rijndael-riscv-zvkned.c new file mode 100644 index 00000000..e3ba6769 --- /dev/null +++ b/cipher/rijndael-riscv-zvkned.c @@ -0,0 +1,1608 @@ +/* rijndael-riscv-zvkned.c - RISC-V vector crypto implementation of AES + * Copyright (C) 2025 Jussi Kivilinna <jussi.kivili...@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <config.h> + +#if defined (__riscv) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS) + +#include "g10lib.h" +#include "simd-common-riscv.h" +#include "rijndael-internal.h" +#include "cipher-internal.h" + +#include <riscv_vector.h> + + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INLINE __attribute__((noinline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ALWAYS_INLINE ASM_FUNC_ATTR +#define ASM_FUNC_ATTR_NOINLINE NO_INLINE ASM_FUNC_ATTR + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT_O2 +#endif + + +/* + * Helper macro and functions + */ + +#define cast_u8m1_u32m1(a) __riscv_vreinterpret_v_u8m1_u32m1(a) +#define cast_u8m1_u64m1(a) __riscv_vreinterpret_v_u8m1_u64m1(a) +#define cast_u32m1_u8m1(a) __riscv_vreinterpret_v_u32m1_u8m1(a) +#define cast_u32m1_u64m1(a) __riscv_vreinterpret_v_u32m1_u64m1(a) +#define cast_u64m1_u8m1(a) __riscv_vreinterpret_v_u64m1_u8m1(a) + +#define cast_u8m2_u32m2(a) __riscv_vreinterpret_v_u8m2_u32m2(a) +#define cast_u32m2_u8m2(a) __riscv_vreinterpret_v_u32m2_u8m2(a) + +#define cast_u8m4_u32m4(a) __riscv_vreinterpret_v_u8m4_u32m4(a) +#define cast_u32m4_u8m4(a) __riscv_vreinterpret_v_u32m4_u8m4(a) + +#define cast_u64m1_u32m1(a) __riscv_vreinterpret_v_u64m1_u32m1(a) +#define cast_u32m1_u64m1(a) __riscv_vreinterpret_v_u32m1_u64m1(a) + +#define cast_u64m1_i64m1(a) __riscv_vreinterpret_v_u64m1_i64m1(a) +#define cast_i64m1_u64m1(a) __riscv_vreinterpret_v_i64m1_u64m1(a) + +#define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory") + + +static ASM_FUNC_ATTR_INLINE vuint32m1_t +bswap128_u32m1(vuint32m1_t vec, size_t vl_u32) +{ + static const byte bswap128_arr[16] = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + size_t vl_bytes = vl_u32 * 4; + vuint8m1_t bswap128 = __riscv_vle8_v_u8m1(bswap128_arr, vl_bytes); + + return cast_u8m1_u32m1( + __riscv_vrgather_vv_u8m1(cast_u32m1_u8m1(vec), bswap128, vl_bytes)); +} + +static ASM_FUNC_ATTR_INLINE vuint32m1_t +unaligned_load_u32m1(const void *ptr, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + return cast_u8m1_u32m1(__riscv_vle8_v_u8m1(ptr, vl_bytes)); +} + +static ASM_FUNC_ATTR_INLINE void +unaligned_store_u32m1(void *ptr, vuint32m1_t vec, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + __riscv_vse8_v_u8m1(ptr, cast_u32m1_u8m1(vec), vl_bytes); +} + +static ASM_FUNC_ATTR_INLINE vuint32m4_t +unaligned_load_u32m4(const void *ptr, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + return cast_u8m4_u32m4(__riscv_vle8_v_u8m4(ptr, vl_bytes)); +} + +static ASM_FUNC_ATTR_INLINE void +unaligned_store_u32m4(void *ptr, vuint32m4_t vec, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + __riscv_vse8_v_u8m4(ptr, cast_u32m4_u8m4(vec), vl_bytes); +} + +static vuint32m1_t +vxor_u8_u32m1(vuint32m1_t a, vuint32m1_t b, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + return cast_u8m1_u32m1(__riscv_vxor_vv_u8m1(cast_u32m1_u8m1(a), + cast_u32m1_u8m1(b), vl_bytes)); +} + +static vuint32m4_t +vxor_u8_u32m4(vuint32m4_t a, vuint32m4_t b, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + return cast_u8m4_u32m4(__riscv_vxor_vv_u8m4(cast_u32m4_u8m4(a), + cast_u32m4_u8m4(b), vl_bytes)); +} + + +/* + * HW support detection + */ + +int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 
+_gcry_aes_riscv_zvkned_setup_acceleration(RIJNDAEL_context *ctx) +{ + (void)ctx; + return (__riscv_vsetvl_e32m1(4) == 4); +} + + +/* + * Key expansion + */ + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +aes128_riscv_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + size_t vl = 4; + + vuint32m1_t round_key = unaligned_load_u32m1 (key, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[0][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 1, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[1][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 2, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[2][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 3, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[3][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 4, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[4][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 5, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[5][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 6, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[6][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 7, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[7][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 8, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[8][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 9, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[9][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 10, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[10][0], round_key, vl); + + clear_vec_regs(); +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +aes192_riscv_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + size_t vl = 4; + u32 *w = &ctx->keyschenc32[0][0]; + u32 wr; + vuint32m1_t rk_0_7; + vuint32m1_t rk_4_11; + + rk_0_7 = unaligned_load_u32m1 (&key[0], vl); + rk_4_11 = unaligned_load_u32m1 (&key[8], vl); + __riscv_vse32_v_u32m1 (&w[0], rk_0_7, vl); + __riscv_vse32_v_u32m1 (&w[2], rk_4_11, vl); + +#define AES192_KF1_GEN(out, input, round192, vl) \ + ({ \ + u32 temp_array[4] = { 0, 0, 0, 0 }; \ + vuint32m1_t temp_vec; \ + temp_array[3] = (input); \ + temp_vec = __riscv_vle32_v_u32m1(temp_array, (vl)); \ + temp_vec = __riscv_vaeskf1_vi_u32m1(temp_vec, (round192), (vl)); \ + (out) = __riscv_vmv_x_s_u32m1_u32(temp_vec); \ + }) + +#define AES192_EXPAND_BLOCK(w, round192, wr, last) \ + ({ \ + (w)[(round192) * 6 + 0] = (w)[(round192) * 6 - 6] ^ (wr); \ + (w)[(round192) * 6 + 1] = (w)[(round192) * 6 - 5] ^ (w)[(round192) * 6 + 0]; \ + (w)[(round192) * 6 + 2] = (w)[(round192) * 6 - 4] ^ (w)[(round192) * 6 + 1]; \ + (w)[(round192) * 6 + 3] = (w)[(round192) * 6 - 3] ^ (w)[(round192) * 6 + 2]; \ + if (!(last)) \ + { \ + (w)[(round192) * 6 + 4] = (w)[(round192) * 6 - 2] ^ (w)[(round192) * 6 + 3]; \ + (w)[(round192) * 6 + 5] = (w)[(round192) * 6 - 1] ^ (w)[(round192) * 6 + 4]; \ + } \ + }) + + AES192_KF1_GEN(wr, w[5], 1, vl); + AES192_EXPAND_BLOCK(w, 1, wr, 0); + + AES192_KF1_GEN(wr, w[11], 2, vl); + AES192_EXPAND_BLOCK(w, 2, wr, 0); + + AES192_KF1_GEN(wr, w[17], 3, vl); + AES192_EXPAND_BLOCK(w, 3, wr, 0); + + AES192_KF1_GEN(wr, w[23], 4, vl); + AES192_EXPAND_BLOCK(w, 4, wr, 0); + + AES192_KF1_GEN(wr, w[29], 5, vl); + AES192_EXPAND_BLOCK(w, 5, wr, 0); + + AES192_KF1_GEN(wr, w[35], 6, vl); + AES192_EXPAND_BLOCK(w, 6, wr, 0); + + AES192_KF1_GEN(wr, w[41], 7, vl); + AES192_EXPAND_BLOCK(w, 7, wr, 0); + + 
AES192_KF1_GEN(wr, w[47], 8, vl); + AES192_EXPAND_BLOCK(w, 8, wr, 1); + +#undef AES192_KF1_GEN +#undef AES192_EXPAND_BLOCK + + clear_vec_regs(); +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +aes256_riscv_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + size_t vl = 4; + + vuint32m1_t rk_a = unaligned_load_u32m1 (&key[0], vl); + vuint32m1_t rk_b = unaligned_load_u32m1 (&key[16], vl); + + __riscv_vse32_v_u32m1(&ctx->keyschenc32[0][0], rk_a, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[1][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 2, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[2][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 3, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[3][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 4, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[4][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 5, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[5][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 6, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[6][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 7, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[7][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 8, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[8][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 9, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[9][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 10, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[10][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 11, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[11][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 12, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[12][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 13, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[13][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 14, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[14][0], rk_a, vl); + + clear_vec_regs(); +} + +void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + unsigned int rounds = ctx->rounds; + + if (rounds < 12) + { + aes128_riscv_setkey(ctx, key); + } + else if (rounds == 12) + { + aes192_riscv_setkey(ctx, key); + _gcry_burn_stack(64); + } + else + { + aes256_riscv_setkey(ctx, key); + } +} + +static ASM_FUNC_ATTR_INLINE void +do_prepare_decryption(RIJNDAEL_context *ctx) +{ + u32 *ekey = (u32 *)(void *)ctx->keyschenc; + u32 *dkey = (u32 *)(void *)ctx->keyschdec; + int rounds = ctx->rounds; + size_t vl = 4; + int rr; + int r; + + r = 0; + rr = rounds; + for (r = 0, rr = rounds; r <= rounds; r++, rr--) + { + __riscv_vse32_v_u32m1(dkey + r * 4, + __riscv_vle32_v_u32m1(ekey + rr * 4, vl), + vl); + } +} + +void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_prepare_decryption(RIJNDAEL_context *ctx) +{ + do_prepare_decryption(ctx); + clear_vec_regs(); +} + + +/* + * Encryption / Decryption + */ + +#define ROUND_KEY_VARIABLES \ + vuint32m1_t rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7, rk8; \ + vuint32m1_t rk9, rk10, rk11, rk12, rk13, rk_last; + +#define PRELOAD_ROUND_KEYS(rk, nrounds, vl) \ + do { \ + rk0 = __riscv_vle32_v_u32m1(rk + 0 * 4, vl); \ + rk1 = __riscv_vle32_v_u32m1(rk + 1 * 4, vl); \ + rk2 = __riscv_vle32_v_u32m1(rk + 2 * 4, vl); \ + rk3 = __riscv_vle32_v_u32m1(rk + 3 * 4, vl); \ + rk4 = __riscv_vle32_v_u32m1(rk + 4 * 4, vl); \ + rk5 = __riscv_vle32_v_u32m1(rk + 5 * 4, vl); \ 
+ rk6 = __riscv_vle32_v_u32m1(rk + 6 * 4, vl); \ + rk7 = __riscv_vle32_v_u32m1(rk + 7 * 4, vl); \ + rk8 = __riscv_vle32_v_u32m1(rk + 8 * 4, vl); \ + rk9 = __riscv_vle32_v_u32m1(rk + 9 * 4, vl); \ + if (UNLIKELY(nrounds >= 12)) \ + { \ + rk10 = __riscv_vle32_v_u32m1(rk + 10 * 4, vl); \ + rk11 = __riscv_vle32_v_u32m1(rk + 11 * 4, vl); \ + if (LIKELY(nrounds > 12)) \ + { \ + rk12 = __riscv_vle32_v_u32m1(rk + 12 * 4, vl); \ + rk13 = __riscv_vle32_v_u32m1(rk + 13 * 4, vl); \ + } \ + else \ + { \ + rk12 = __riscv_vundefined_u32m1(); \ + rk13 = __riscv_vundefined_u32m1(); \ + } \ + } \ + else \ + { \ + rk10 = __riscv_vundefined_u32m1(); \ + rk11 = __riscv_vundefined_u32m1(); \ + rk12 = __riscv_vundefined_u32m1(); \ + rk13 = __riscv_vundefined_u32m1(); \ + } \ + rk_last = __riscv_vle32_v_u32m1(rk + nrounds * 4, vl); \ + } while (0) + +#ifdef HAVE_BROKEN_VAES_VS_INTRINSIC +#define AES_CRYPT(e_d, mx, nrounds, blk, vlen) \ + asm ( "vsetvli zero,%[vl],e32,"#mx",ta,ma;\n\t" \ + "vaesz.vs %[block],%[rk0];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk1];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk2];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk3];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk4];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk5];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk6];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk7];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk8];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk9];\n\t" \ + "blt %[rounds],%[num12],.Lcryptlast%=;\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk10];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk11];\n\t" \ + "beq %[rounds],%[num12],.Lcryptlast%=;\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk12];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk13];\n\t" \ + ".Lcryptlast%=:\n\t" \ + "vaes"#e_d"f.vs %[block],%[rk_last];\n\t" \ + : [block] "+vr" (blk) \ + : [vl] "r" (vlen), [rounds] "r" (nrounds), [num12] "r" (12), \ + [rk0] "vr" (rk0), [rk1] "vr" (rk1), [rk2] "vr" (rk2), \ + [rk3] "vr" (rk3), [rk4] "vr" (rk4), [rk5] "vr" (rk5), \ + [rk6] "vr" (rk6), [rk7] "vr" (rk7), [rk8] "vr" (rk8), \ + [rk9] "vr" (rk9), [rk10] "vr" (rk10), [rk11] "vr" (rk11), \ + [rk12] "vr" (rk12), [rk13] "vr" (rk13), \ + [rk_last] "vr" (rk_last) \ + : "vl") +#else +#define AES_CRYPT(e_d, mx, rounds, block, vl) \ + ({ \ + (block) = __riscv_vaesz_vs_u32m1_u32##mx((block), rk0, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk1, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk2, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk3, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk4, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk5, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk6, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk7, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk8, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk9, (vl)); \ + if (UNLIKELY((rounds) >= 12)) \ + { \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk10, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk11, (vl)); \ + if (LIKELY((rounds) > 12)) \ + { \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk12, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk13, (vl)); \ + } \ + } \ + (block) = __riscv_vaes##e_d##f_vs_u32m1_u32##mx((block), rk_last, (vl)); \ + }) +#endif + +unsigned int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_encrypt (const RIJNDAEL_context *ctx, unsigned char *out, + const 
unsigned char *in) +{ + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + vuint32m1_t block; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + block = unaligned_load_u32m1(in, vl); + + AES_CRYPT(e, m1, rounds, block, vl); + + unaligned_store_u32m1(out, block, vl); + + clear_vec_regs(); + + return 0; /* does not use stack */ +} + +unsigned int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_decrypt (const RIJNDAEL_context *ctx, unsigned char *out, + const unsigned char *in) +{ + const u32 *rk = ctx->keyschdec32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + vuint32m1_t block; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + block = unaligned_load_u32m1(in, vl); + + AES_CRYPT(d, m1, rounds, block, vl); + + unaligned_store_u32m1(out, block, vl); + + clear_vec_regs(); + + return 0; /* does not use stack */ +} + +static ASM_FUNC_ATTR_INLINE void +aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = encrypt ? ctx->keyschenc32[0] : ctx->keyschdec32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + ROUND_KEY_VARIABLES; + + if (!encrypt && !ctx->decryption_prepared) + { + do_prepare_decryption(ctx); + ctx->decryption_prepared = 1; + } + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m4_t blocks; + + blocks = unaligned_load_u32m4(inbuf, vl * 4); + + if (encrypt) + AES_CRYPT(e, m4, rounds, blocks, vl * 4); + else + AES_CRYPT(d, m4, rounds, blocks, vl * 4); + + unaligned_store_u32m4(outbuf, blocks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + vuint32m1_t block; + + block = unaligned_load_u32m1(inbuf, vl); + + if (encrypt) + AES_CRYPT(e, m1, rounds, block, vl); + else + AES_CRYPT(d, m1, rounds, block, vl); + + unaligned_store_u32m1(outbuf, block, vl); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + clear_vec_regs(); +} + +static void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +aes_riscv_zvkned_ecb_enc (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + aes_riscv_zvkned_ecb_crypt (context, outbuf_arg, inbuf_arg, nblocks, 1); +} + +static void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +aes_riscv_zvkned_ecb_dec (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + aes_riscv_zvkned_ecb_crypt (context, outbuf_arg, inbuf_arg, nblocks, 0); +} + +void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt) +{ + if (encrypt) + aes_riscv_zvkned_ecb_enc (context, outbuf_arg, inbuf_arg, nblocks); + else + aes_riscv_zvkned_ecb_dec (context, outbuf_arg, inbuf_arg, nblocks); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_cfb_enc (void *context, unsigned char *iv_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t iv; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + iv = unaligned_load_u32m1(iv_arg, vl); + + for (; nblocks; nblocks--) + { + vuint8m1_t data = 
__riscv_vle8_v_u8m1(inbuf, vl_bytes); + + AES_CRYPT(e, m1, rounds, iv, vl); + + data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(iv), data, vl_bytes); + __riscv_vse8_v_u8m1(outbuf, data, vl_bytes); + iv = cast_u8m1_u32m1(data); + + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(iv_arg, iv, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_cbc_enc (void *context, unsigned char *iv_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int cbc_mac) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + size_t outbuf_add = (!cbc_mac) * BLOCKSIZE; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t iv; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + iv = unaligned_load_u32m1(iv_arg, vl); + + for (; nblocks; nblocks--) + { + vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes); + iv = cast_u8m1_u32m1( + __riscv_vxor_vv_u8m1(data, cast_u32m1_u8m1(iv), vl_bytes)); + + AES_CRYPT(e, m1, rounds, iv, vl); + + __riscv_vse8_v_u8m1(outbuf, cast_u32m1_u8m1(iv), vl_bytes); + + inbuf += BLOCKSIZE; + outbuf += outbuf_add; + } + + unaligned_store_u32m1(iv_arg, iv, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_ctr_enc (void *context, unsigned char *ctr_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + static const byte add_u8_array[4][16] = + { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 } + }; + static const u64 carry_add[2] = { 1, 1 }; + static const u64 nocarry_add[2] = { 1, 0 }; + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + u64 ctrlow; + vuint32m1_t ctr; + vuint8m1_t add1; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + add1 = __riscv_vle8_v_u8m1(add_u8_array[0], vl_bytes); + ctr = unaligned_load_u32m1(ctr_arg, vl); + ctrlow = __riscv_vmv_x_s_u64m1_u64(cast_u32m1_u64m1(bswap128_u32m1(ctr, vl))); + + memory_barrier_with_vec(add1); + + if (nblocks >= 4) + { + vuint8m1_t add2 = __riscv_vle8_v_u8m1(add_u8_array[1], vl_bytes); + vuint8m1_t add3 = __riscv_vle8_v_u8m1(add_u8_array[2], vl_bytes); + vuint8m1_t add4 = __riscv_vle8_v_u8m1(add_u8_array[3], vl_bytes); + + memory_barrier_with_vec(add2); + memory_barrier_with_vec(add3); + memory_barrier_with_vec(add4); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint8m4_t data4blks; + vuint32m4_t ctr4blks; + + /* detect if 8-bit carry handling is needed */ + if (UNLIKELY(((ctrlow += 4) & 0xff) <= 3)) + { + static const u64 *adders[5][4] = + { + { nocarry_add, nocarry_add, nocarry_add, carry_add }, + { nocarry_add, nocarry_add, carry_add, nocarry_add }, + { nocarry_add, carry_add, nocarry_add, nocarry_add }, + { carry_add, nocarry_add, nocarry_add, nocarry_add }, + { nocarry_add, nocarry_add, nocarry_add, nocarry_add } + }; + unsigned int idx = ctrlow <= 3 ? 
ctrlow : 4; + vuint64m1_t ctr_u64; + vuint32m1_t ctr_u32_1; + vuint32m1_t ctr_u32_2; + vuint32m1_t ctr_u32_3; + vuint32m1_t ctr_u32_4; + vuint64m1_t add_u64; + + /* Byte swap counter */ + ctr_u64 = cast_u32m1_u64m1(bswap128_u32m1(ctr, vl)); + + /* Addition with carry handling */ + add_u64 = __riscv_vle64_v_u64m1(adders[idx][0], vl / 2); + ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2); + ctr_u32_1 = cast_u64m1_u32m1(ctr_u64); + + add_u64 = __riscv_vle64_v_u64m1(adders[idx][1], vl / 2); + ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2); + ctr_u32_2 = cast_u64m1_u32m1(ctr_u64); + + add_u64 = __riscv_vle64_v_u64m1(adders[idx][2], vl / 2); + ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2); + ctr_u32_3 = cast_u64m1_u32m1(ctr_u64); + + add_u64 = __riscv_vle64_v_u64m1(adders[idx][3], vl / 2); + ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2); + ctr_u32_4 = cast_u64m1_u32m1(ctr_u64); + + /* Byte swap counters */ + ctr_u32_1 = bswap128_u32m1(ctr_u32_1, vl); + ctr_u32_2 = bswap128_u32m1(ctr_u32_2, vl); + ctr_u32_3 = bswap128_u32m1(ctr_u32_3, vl); + ctr_u32_4 = bswap128_u32m1(ctr_u32_4, vl); + + ctr4blks = __riscv_vundefined_u32m4(); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, ctr_u32_1); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, ctr_u32_2); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, ctr_u32_3); + ctr = ctr_u32_4; + } + else + { + /* Fast path addition without carry handling */ + vuint8m1_t ctr_u8 = cast_u32m1_u8m1(ctr); + vuint8m1_t ctr1 = __riscv_vadd_vv_u8m1(ctr_u8, add1, vl_bytes); + vuint8m1_t ctr2 = __riscv_vadd_vv_u8m1(ctr_u8, add2, vl_bytes); + vuint8m1_t ctr3 = __riscv_vadd_vv_u8m1(ctr_u8, add3, vl_bytes); + vuint8m4_t ctr0123_u8 = __riscv_vundefined_u8m4(); + + ctr = cast_u8m1_u32m1(__riscv_vadd_vv_u8m1(ctr_u8, add4, + vl_bytes)); + + ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 0, ctr_u8); + ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 1, ctr1); + ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 2, ctr2); + ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 3, ctr3); + + ctr4blks = cast_u8m4_u32m4(ctr0123_u8); + } + + data4blks = __riscv_vle8_v_u8m4(inbuf, vl_bytes * 4); + + AES_CRYPT(e, m4, rounds, ctr4blks, vl * 4); + + data4blks = __riscv_vxor_vv_u8m4(cast_u32m4_u8m4(ctr4blks), data4blks, + vl_bytes * 4); + __riscv_vse8_v_u8m4(outbuf, data4blks, vl_bytes * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + } + + for (; nblocks; nblocks--) + { + vuint32m1_t block = ctr; + vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes); + + /* detect if 8-bit carry handling is needed */ + if (UNLIKELY((++ctrlow & 0xff) == 0)) + { + const u64 *add_arr = UNLIKELY(ctrlow == 0) ? 
carry_add : nocarry_add; + vuint64m1_t add_val = __riscv_vle64_v_u64m1(add_arr, vl / 2); + + /* Byte swap counter */ + ctr = bswap128_u32m1(ctr, vl); + + /* Addition with carry handling */ + ctr = cast_u64m1_u32m1(__riscv_vadd_vv_u64m1(cast_u32m1_u64m1(ctr), + add_val, vl / 2)); + + /* Byte swap counter */ + ctr = bswap128_u32m1(ctr, vl); + } + else + { + /* Fast path addition without carry handling */ + ctr = cast_u8m1_u32m1(__riscv_vadd_vv_u8m1(cast_u32m1_u8m1(ctr), + add1, vl_bytes)); + } + + AES_CRYPT(e, m1, rounds, block, vl); + + data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(block), data, vl_bytes); + __riscv_vse8_v_u8m1(outbuf, data, vl_bytes); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(ctr_arg, ctr, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_ctr32le_enc (void *context, unsigned char *ctr_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + static const u32 add_u32_array[4][16] = + { + { 1, }, { 2, }, { 3, }, { 4, } + }; + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t ctr; + vuint32m1_t add1; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + add1 = __riscv_vle32_v_u32m1(add_u32_array[0], vl); + ctr = unaligned_load_u32m1(ctr_arg, vl); + + memory_barrier_with_vec(add1); + + if (nblocks >= 4) + { + vuint32m1_t add2 = __riscv_vle32_v_u32m1(add_u32_array[1], vl); + vuint32m1_t add3 = __riscv_vle32_v_u32m1(add_u32_array[2], vl); + vuint32m1_t add4 = __riscv_vle32_v_u32m1(add_u32_array[3], vl); + + memory_barrier_with_vec(add2); + memory_barrier_with_vec(add3); + memory_barrier_with_vec(add4); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m1_t ctr1 = __riscv_vadd_vv_u32m1(ctr, add1, vl); + vuint32m1_t ctr2 = __riscv_vadd_vv_u32m1(ctr, add2, vl); + vuint32m1_t ctr3 = __riscv_vadd_vv_u32m1(ctr, add3, vl); + vuint32m4_t ctr4blks = __riscv_vundefined_u32m4(); + vuint8m4_t data4blks; + + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, ctr1); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, ctr2); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, ctr3); + ctr = __riscv_vadd_vv_u32m1(ctr, add4, vl); + + data4blks = __riscv_vle8_v_u8m4(inbuf, vl_bytes * 4); + + AES_CRYPT(e, m4, rounds, ctr4blks, vl * 4); + + data4blks = __riscv_vxor_vv_u8m4(cast_u32m4_u8m4(ctr4blks), data4blks, + vl_bytes * 4); + __riscv_vse8_v_u8m4(outbuf, data4blks, vl_bytes * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + } + + for (; nblocks; nblocks--) + { + vuint32m1_t block = ctr; + vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes); + + ctr = __riscv_vadd_vv_u32m1(ctr, add1, vl); + + AES_CRYPT(e, m1, rounds, block, vl); + + data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(block), data, vl_bytes); + __riscv_vse8_v_u8m1(outbuf, data, vl_bytes); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(ctr_arg, ctr, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_cfb_dec (void *context, unsigned char *iv_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = 
ctx->rounds; + size_t vl = 4; + vuint32m1_t iv; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + iv = unaligned_load_u32m1(iv_arg, vl); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m1_t iv1 = __riscv_vget_v_u32m4_u32m1(data4blks, 0); + vuint32m1_t iv2 = __riscv_vget_v_u32m4_u32m1(data4blks, 1); + vuint32m1_t iv3 = __riscv_vget_v_u32m4_u32m1(data4blks, 2); + vuint32m1_t iv4 = __riscv_vget_v_u32m4_u32m1(data4blks, 3); + vuint32m4_t iv4blks = __riscv_vundefined_u32m4(); + + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 0, iv); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 1, iv1); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 2, iv2); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 3, iv3); + iv = iv4; + + AES_CRYPT(e, m4, rounds, iv4blks, vl * 4); + + data4blks = vxor_u8_u32m4(iv4blks, data4blks, vl * 4); + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + vuint32m1_t data = unaligned_load_u32m1(inbuf, vl); + vuint32m1_t new_iv = data; + + AES_CRYPT(e, m1, rounds, iv, vl); + + data = vxor_u8_u32m1(iv, data, vl); + unaligned_store_u32m1(outbuf, data, vl); + iv = new_iv; + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(iv_arg, iv, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_cbc_dec (void *context, unsigned char *iv_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschdec32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + vuint32m1_t iv; + ROUND_KEY_VARIABLES; + + if (!ctx->decryption_prepared) + { + do_prepare_decryption(ctx); + ctx->decryption_prepared = 1; + } + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + iv = unaligned_load_u32m1(iv_arg, vl); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m1_t iv1 = __riscv_vget_v_u32m4_u32m1(data4blks, 0); + vuint32m1_t iv2 = __riscv_vget_v_u32m4_u32m1(data4blks, 1); + vuint32m1_t iv3 = __riscv_vget_v_u32m4_u32m1(data4blks, 2); + vuint32m1_t iv4 = __riscv_vget_v_u32m4_u32m1(data4blks, 3); + vuint32m4_t iv4blks = __riscv_vundefined_u32m4(); + + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 0, iv); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 1, iv1); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 2, iv2); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 3, iv3); + + AES_CRYPT(d, m4, rounds, data4blks, vl * 4); + + data4blks = vxor_u8_u32m4(iv4blks, data4blks, vl * 4); + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + iv = iv4; + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + vuint32m1_t data = unaligned_load_u32m1(inbuf, vl); + vuint32m1_t new_iv = data; + + AES_CRYPT(d, m1, rounds, data, vl); + + data = vxor_u8_u32m1(iv, data, vl); + unaligned_store_u32m1(outbuf, data, vl); + iv = new_iv; + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(iv_arg, iv, vl); + + clear_vec_regs(); +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 size_t +aes_riscv_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + RIJNDAEL_context *ctx = (void *)&c->context.c; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + u64 n = 
c->u_mode.ocb.data_nblocks; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t iv; + vuint32m1_t ctr; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + /* Preload Offset and Checksum */ + iv = unaligned_load_u32m1(c->u_iv.iv, vl); + ctr = unaligned_load_u32m1(c->u_ctr.ctr, vl); + + if (nblocks >= 4) + { + vuint32m4_t ctr4blks = __riscv_vundefined_u32m4(); + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl); + + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero); + + for (; nblocks >= 4; nblocks -= 4) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m4_t offsets = __riscv_vundefined_u32m4(); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv); + + data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4); + + AES_CRYPT(e, m4, rounds, data4blks, vl * 4); + + data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4); + + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0), + __riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl); + } + + for (; nblocks; nblocks--) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint32m1_t data; + + data = unaligned_load_u32m1(inbuf, vl); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr = vxor_u8_u32m1(ctr, data, vl); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + + data = vxor_u8_u32m1(data, iv, vl); + + AES_CRYPT(e, m1, rounds, data, vl); + + data = vxor_u8_u32m1(iv, data, vl); + unaligned_store_u32m1(outbuf, data, vl); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + c->u_mode.ocb.data_nblocks = n; + + unaligned_store_u32m1(c->u_iv.iv, iv, vl); + unaligned_store_u32m1(c->u_ctr.ctr, ctr, vl); + + clear_vec_regs(); + + return 0; +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 size_t +aes_riscv_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + RIJNDAEL_context *ctx = (void *)&c->context.c; + unsigned char *outbuf = 
outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + u64 n = c->u_mode.ocb.data_nblocks; + const u32 *rk = ctx->keyschdec32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t iv; + vuint32m1_t ctr; + ROUND_KEY_VARIABLES; + + if (!ctx->decryption_prepared) + { + do_prepare_decryption(ctx); + ctx->decryption_prepared = 1; + } + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + /* Preload Offset and Checksum */ + iv = unaligned_load_u32m1(c->u_iv.iv, vl); + ctr = unaligned_load_u32m1(c->u_ctr.ctr, vl); + + if (nblocks >= 4) + { + vuint32m4_t ctr4blks = __riscv_vundefined_u32m4(); + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl); + + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero); + + for (; nblocks >= 4; nblocks -= 4) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m4_t offsets = __riscv_vundefined_u32m4(); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv); + + data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4); + + AES_CRYPT(d, m4, rounds, data4blks, vl * 4); + + data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4); + + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0), + __riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl); + } + + for (; nblocks; nblocks--) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint8m1_t data; + vuint32m1_t block; + + l = ocb_get_l(c, ++n); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + data = __riscv_vle8_v_u8m1(inbuf, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + data = __riscv_vxor_vv_u8m1(data, cast_u32m1_u8m1(iv), vl_bytes); + block = cast_u8m1_u32m1(data); + + AES_CRYPT(d, m1, rounds, block, vl); + + block = vxor_u8_u32m1(iv, block, vl); + unaligned_store_u32m1(outbuf, block, vl); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr = vxor_u8_u32m1(ctr, block, vl); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + c->u_mode.ocb.data_nblocks = n; + + unaligned_store_u32m1(c->u_iv.iv, iv, vl); + unaligned_store_u32m1(c->u_ctr.ctr, ctr, vl); + + 
clear_vec_regs(); + + return 0; +} + +size_t ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt) +{ + if (encrypt) + return aes_riscv_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks); + else + return aes_riscv_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks); +} + +size_t ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = (void *)&c->context.c; + const unsigned char *abuf = abuf_arg; + u64 n = c->u_mode.ocb.aad_nblocks; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t iv; + vuint32m1_t ctr; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + /* Preload Offset and Sum */ + iv = unaligned_load_u32m1(c->u_mode.ocb.aad_offset, vl); + ctr = unaligned_load_u32m1(c->u_mode.ocb.aad_sum, vl); + + if (nblocks >= 4) + { + vuint32m4_t ctr4blks = __riscv_vundefined_u32m4(); + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl); + + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero); + + for (; nblocks >= 4; nblocks -= 4) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint32m4_t data4blks = unaligned_load_u32m4(abuf, vl * 4); + vuint32m4_t offsets = __riscv_vundefined_u32m4(); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv); + + data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4); + + AES_CRYPT(e, m4, rounds, data4blks, vl * 4); + + ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4); + + abuf += 4 * BLOCKSIZE; + } + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0), + __riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl); + } + + for (; nblocks; nblocks--) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint32m1_t data; + + data = unaligned_load_u32m1(abuf, vl); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + + data = vxor_u8_u32m1(data, iv, vl); + + AES_CRYPT(e, m1, rounds, data, vl); + + ctr = vxor_u8_u32m1(ctr, data, vl); + + abuf += BLOCKSIZE; + } + + c->u_mode.ocb.aad_nblocks = n; + + unaligned_store_u32m1(c->u_mode.ocb.aad_offset, iv, vl); + 
unaligned_store_u32m1(c->u_mode.ocb.aad_sum, ctr, vl); + + clear_vec_regs(); + + return 0; +} + +static const u64 xts_gfmul_const[2] = { 0x87, 0x01 }; +static const u64 xts_swap64_const[2] = { 1, 0 }; + +static ASM_FUNC_ATTR_INLINE vuint32m1_t +xts_gfmul_byA (vuint32m1_t vec_in, vuint64m1_t xts_gfmul, + vuint64m1_t xts_swap64, size_t vl) +{ + vuint64m1_t in_u64 = cast_u32m1_u64m1(vec_in); + vuint64m1_t tmp1; + + tmp1 = + __riscv_vrgather_vv_u64m1(cast_u32m1_u64m1(vec_in), xts_swap64, vl / 2); + tmp1 = cast_i64m1_u64m1( + __riscv_vsra_vx_i64m1(cast_u64m1_i64m1(tmp1), 63, vl / 2)); + in_u64 = __riscv_vadd_vv_u64m1(in_u64, in_u64, vl / 2); + tmp1 = __riscv_vand_vv_u64m1(tmp1, xts_gfmul, vl / 2); + + return cast_u64m1_u32m1(__riscv_vxor_vv_u64m1(in_u64, tmp1, vl / 2)); +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +aes_riscv_xts_enc (void *context, unsigned char *tweak_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + vuint32m1_t tweak; + vuint64m1_t xts_gfmul = __riscv_vle64_v_u64m1(xts_gfmul_const, vl / 2); + vuint64m1_t xts_swap64 = __riscv_vle64_v_u64m1(xts_swap64_const, vl / 2); + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + /* Preload tweak */ + tweak = unaligned_load_u32m1(tweak_arg, vl); + + memory_barrier_with_vec(xts_gfmul); + memory_barrier_with_vec(xts_swap64); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m4_t tweaks = __riscv_vundefined_u32m4(); + + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 0, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 1, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 2, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 3, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + + data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4); + + AES_CRYPT(e, m4, rounds, data4blks, vl * 4); + + data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4); + + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + vuint32m1_t data = unaligned_load_u32m1(inbuf, vl); + vuint32m1_t tweak0 = tweak; + + data = vxor_u8_u32m1(data, tweak0, vl); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + + AES_CRYPT(e, m1, rounds, data, vl); + + data = vxor_u8_u32m1(data, tweak0, vl); + unaligned_store_u32m1(outbuf, data, vl); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(tweak_arg, tweak, vl); + + clear_vec_regs(); +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +aes_riscv_xts_dec (void *context, unsigned char *tweak_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschdec32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + vuint32m1_t tweak; + vuint64m1_t xts_gfmul = __riscv_vle64_v_u64m1(xts_gfmul_const, vl / 2); + vuint64m1_t xts_swap64 = __riscv_vle64_v_u64m1(xts_swap64_const, vl / 2); + ROUND_KEY_VARIABLES; + + if (!ctx->decryption_prepared) + { + 
do_prepare_decryption(ctx); + ctx->decryption_prepared = 1; + } + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + /* Preload tweak */ + tweak = unaligned_load_u32m1(tweak_arg, vl); + + memory_barrier_with_vec(xts_gfmul); + memory_barrier_with_vec(xts_swap64); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m4_t tweaks = __riscv_vundefined_u32m4(); + + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 0, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 1, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 2, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 3, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + + data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4); + + AES_CRYPT(d, m4, rounds, data4blks, vl * 4); + + data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4); + + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + vuint32m1_t data = unaligned_load_u32m1(inbuf, vl); + vuint32m1_t tweak0 = tweak; + + data = vxor_u8_u32m1(data, tweak0, vl); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + + AES_CRYPT(d, m1, rounds, data, vl); + + data = vxor_u8_u32m1(data, tweak0, vl); + unaligned_store_u32m1(outbuf, data, vl); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(tweak_arg, tweak, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_xts_crypt (void *context, unsigned char *tweak_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) +{ + if (encrypt) + aes_riscv_xts_enc(context, tweak_arg, outbuf_arg, inbuf_arg, nblocks); + else + aes_riscv_xts_dec(context, tweak_arg, outbuf_arg, inbuf_arg, nblocks); +} + +#endif /* HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS */ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 0c48793b..52500e59 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -280,6 +280,63 @@ extern void _gcry_aes_vp_riscv_xts_crypt (void *context, unsigned char *tweak, size_t nblocks, int encrypt); #endif +#ifdef USE_RISCV_V_CRYPTO +/* RISC-V vector cryptography extension implementation of AES */ +extern int +_gcry_aes_riscv_zvkned_setup_acceleration (RIJNDAEL_context *ctx); + +extern void +_gcry_aes_riscv_zvkned_setkey (RIJNDAEL_context *ctx, const byte *key); +extern void +_gcry_aes_riscv_zvkned_prepare_decryption (RIJNDAEL_context *ctx); + +extern unsigned int +_gcry_aes_riscv_zvkned_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, + const unsigned char *src); +extern unsigned int +_gcry_aes_riscv_zvkned_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, + const unsigned char *src); +extern void +_gcry_aes_riscv_zvkned_cfb_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void +_gcry_aes_riscv_zvkned_cbc_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int cbc_mac); +extern void +_gcry_aes_riscv_zvkned_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void +_gcry_aes_riscv_zvkned_ctr32le_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void 
+_gcry_aes_riscv_zvkned_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void +_gcry_aes_riscv_zvkned_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern size_t +_gcry_aes_riscv_zvkned_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); +extern size_t +_gcry_aes_riscv_zvkned_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, + size_t nblocks); +extern void +_gcry_aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); +extern void +_gcry_aes_riscv_zvkned_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); +#endif + #ifdef USE_PADLOCK extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, @@ -774,9 +831,36 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, bulk_ops->xts_crypt = _gcry_aes_vp_aarch64_xts_crypt; } #endif +#ifdef USE_RISCV_V_CRYPTO + else if ((hwfeatures & HWF_RISCV_IMAFDC) + && (hwfeatures & HWF_RISCV_V) + && (hwfeatures & HWF_RISCV_ZVKNED) + && _gcry_aes_riscv_zvkned_setup_acceleration(ctx)) + { + hw_setkey = _gcry_aes_riscv_zvkned_setkey; + ctx->encrypt_fn = _gcry_aes_riscv_zvkned_encrypt; + ctx->decrypt_fn = _gcry_aes_riscv_zvkned_decrypt; + ctx->prefetch_enc_fn = NULL; + ctx->prefetch_dec_fn = NULL; + ctx->prepare_decryption = _gcry_aes_riscv_zvkned_prepare_decryption; + + /* Setup RISC-V vector cryptography bulk encryption routines. */ + bulk_ops->cfb_enc = _gcry_aes_riscv_zvkned_cfb_enc; + bulk_ops->cfb_dec = _gcry_aes_riscv_zvkned_cfb_dec; + bulk_ops->cbc_enc = _gcry_aes_riscv_zvkned_cbc_enc; + bulk_ops->cbc_dec = _gcry_aes_riscv_zvkned_cbc_dec; + bulk_ops->ctr_enc = _gcry_aes_riscv_zvkned_ctr_enc; + bulk_ops->ctr32le_enc = _gcry_aes_riscv_zvkned_ctr32le_enc; + bulk_ops->ocb_crypt = _gcry_aes_riscv_zvkned_ocb_crypt; + bulk_ops->ocb_auth = _gcry_aes_riscv_zvkned_ocb_auth; + bulk_ops->ecb_crypt = _gcry_aes_riscv_zvkned_ecb_crypt; + bulk_ops->xts_crypt = _gcry_aes_riscv_zvkned_xts_crypt; + } +#endif #ifdef USE_VP_RISCV - else if ((hwfeatures & HWF_RISCV_IMAFDC) && (hwfeatures & HWF_RISCV_V) && - _gcry_aes_vp_riscv_setup_acceleration(ctx)) + else if ((hwfeatures & HWF_RISCV_IMAFDC) + && (hwfeatures & HWF_RISCV_V) + && _gcry_aes_vp_riscv_setup_acceleration(ctx)) { hw_setkey = _gcry_aes_vp_riscv_do_setkey; ctx->encrypt_fn = _gcry_aes_vp_riscv_encrypt; @@ -785,7 +869,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, ctx->prefetch_dec_fn = NULL; ctx->prepare_decryption = _gcry_aes_vp_riscv_prepare_decryption; - /* Setup vector permute AArch64 bulk encryption routines. */ + /* Setup vector permute RISC-V bulk encryption routines. 
*/ bulk_ops->cfb_enc = _gcry_aes_vp_riscv_cfb_enc; bulk_ops->cfb_dec = _gcry_aes_vp_riscv_cfb_dec; bulk_ops->cbc_enc = _gcry_aes_vp_riscv_cbc_enc; diff --git a/configure.ac b/configure.ac index d45ea851..45fe5143 100644 --- a/configure.ac +++ b/configure.ac @@ -2812,6 +2812,32 @@ m4_define([GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST], __riscv_vse32_v_u32m1(ptr + 4 * vl, w2, vl); __riscv_vse32_v_u32m1(ptr + 5 * vl, w3, vl); } + void test_aes_key(unsigned int *ptr) + { + int vl = __riscv_vsetvl_e32m1 (4); + vuint32m1_t a = __riscv_vle32_v_u32m1(ptr + 0 * vl, vl); + vuint32m1_t b = __riscv_vle32_v_u32m1(ptr + 1 * vl, vl); + vuint32m1_t c = __riscv_vaeskf1_vi_u32m1(a, 1, vl); + vuint32m1_t d = __riscv_vaeskf2_vi_u32m1(a, b, 2, vl); + __riscv_vse32_v_u32m1(ptr + 0 * vl, c, vl); + __riscv_vse32_v_u32m1(ptr + 1 * vl, d, vl); + } + void test_aes_crypt(unsigned int *ptr) + { + int vl = __riscv_vsetvl_e32m1 (4); + vuint32m1_t a = __riscv_vle32_v_u32m1(ptr + 0 * vl, vl); + vuint32m1_t b = __riscv_vle32_v_u32m1(ptr + 1 * vl, vl); + vuint32m1_t c = __riscv_vaesz_vs_u32m1_u32m1(a, b, vl); + vuint32m1_t d = __riscv_vaesem_vs_u32m1_u32m1(a, b, vl); + vuint32m1_t e = __riscv_vaesef_vs_u32m1_u32m1(a, b, vl); + vuint32m1_t f = __riscv_vaesdm_vs_u32m1_u32m1(a, b, vl); + vuint32m1_t g = __riscv_vaesdf_vs_u32m1_u32m1(a, b, vl); + __riscv_vse32_v_u32m1(ptr + 0 * vl, c, vl); + __riscv_vse32_v_u32m1(ptr + 1 * vl, d, vl); + __riscv_vse32_v_u32m1(ptr + 2 * vl, e, vl); + __riscv_vse32_v_u32m1(ptr + 3 * vl, f, vl); + __riscv_vse32_v_u32m1(ptr + 4 * vl, g, vl); + } void test_inline_vec_asm(unsigned int *ptr) { int vl = __riscv_vsetvl_e32m1 (4); @@ -2918,6 +2944,53 @@ EOF CFLAGS=$_gcc_cflags_save; fi +# +# Check whether compiler has working RISC-V vector __riscv_vaes*_vs intrinsics +# +# Some GCC versions generate a 'vsetvli' instruction with an incorrect 'm1' +# LMUL instead of the expected 'mX' for the __riscv_vaes*_vs_u32m1_u32mX +# intrinsics. +# +if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics" = "yes" || + test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes"; then + + # Setup flags for test. + _gcc_cflags_save=$CFLAGS + if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes"; then + CFLAGS="$CFLAGS -O2 -march=rv64imafdcv_zvbc_zvkg_zvkn_zvks -mstrict-align" + else + CFLAGS="$CFLAGS -O2" + fi + + AC_CACHE_CHECK([whether compiler has working RISC-V __riscv_vaes*_vs intrinsics], + [gcry_cv_riscv_vaes_vs_intrinsics_work], + [gcry_cv_riscv_vaes_vs_intrinsics_work=no + cat > conftest.c <<EOF +#include <riscv_vector.h> +vuint32m4_t test_fn(vuint32m4_t a, vuint32m1_t b, int vl) +{ + /* This intrinsic should result in a 'vsetvli' with m4 */ + return __riscv_vaesem_vs_u32m1_u32m4(a, b, vl); +} +EOF + + if $CC $CFLAGS -S conftest.c -o conftest.s >&5 2>&5; then + if grep -E 'vsetvli.*,[[[:space:]]]*m4[[[:space:]]]*,' conftest.s >/dev/null 2>&1; then + gcry_cv_riscv_vaes_vs_intrinsics_work=yes + fi + fi + rm -f conftest.* + ]) + + if test "$gcry_cv_riscv_vaes_vs_intrinsics_work" = "no"; then + AC_DEFINE([HAVE_BROKEN_VAES_VS_INTRINSIC], [1], + [Define to 1 if __riscv_vaes*_vs intrinsics are broken]) + fi + + # Restore flags. + CFLAGS=$_gcc_cflags_save; +fi + ####################################### #### Checks for library functions. 
#### @@ -3390,6 +3463,9 @@ if test "$found" = "1" ; then riscv64-*-*) # Build with the vector permute SIMD128 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vp-riscv.lo" + + # Build with the RISC-V vector cryptography implementation + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-riscv-zvkned.lo" ;; s390x-*-*) # Big-Endian. diff --git a/src/g10lib.h b/src/g10lib.h index 4fa91ba9..991ec3ea 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -281,8 +281,9 @@ char **_gcry_strtokenize (const char *string, const char *delim); #define HWF_RISCV_ZBB (1 << 2) #define HWF_RISCV_ZBC (1 << 3) #define HWF_RISCV_ZVKB (1 << 4) -#define HWF_RISCV_ZVKNHA (1 << 5) -#define HWF_RISCV_ZVKNHB (1 << 6) +#define HWF_RISCV_ZVKNED (1 << 5) +#define HWF_RISCV_ZVKNHA (1 << 6) +#define HWF_RISCV_ZVKNHB (1 << 7) #endif diff --git a/src/hwf-riscv.c b/src/hwf-riscv.c index 925284a1..c37fd8dc 100644 --- a/src/hwf-riscv.c +++ b/src/hwf-riscv.c @@ -191,6 +191,7 @@ detect_riscv_at_hwcap(void) #define HWF_RISCV_HWPROBE_EXT_ZBS (1U << 5) #define HWF_RISCV_HWPROBE_EXT_ZBC (1U << 7) #define HWF_RISCV_HWPROBE_EXT_ZVKB (1U << 19) +#define HWF_RISCV_HWPROBE_EXT_ZVKNED (1U << 21) #define HWF_RISCV_HWPROBE_EXT_ZVKNHA (1U << 22) #define HWF_RISCV_HWPROBE_EXT_ZVKNHB (1U << 23) #define HWF_RISCV_HWPROBE_EXT_ZICOND (U64_C(1) << 35) @@ -215,6 +216,7 @@ static const struct hwprobe_feature_map_s hwprobe_features[] = { HWF_RISCV_HWPROBE_EXT_ZBB, HWF_RISCV_ZBB }, { HWF_RISCV_HWPROBE_EXT_ZBC, HWF_RISCV_ZBC }, { HWF_RISCV_HWPROBE_EXT_ZVKB, HWF_RISCV_ZVKB }, + { HWF_RISCV_HWPROBE_EXT_ZVKNED, HWF_RISCV_ZVKNED }, { HWF_RISCV_HWPROBE_EXT_ZVKNHA, HWF_RISCV_ZVKNHA }, { HWF_RISCV_HWPROBE_EXT_ZVKNHB, HWF_RISCV_ZVKNHB }, }; diff --git a/src/hwfeatures.c b/src/hwfeatures.c index df2aaf17..0752d787 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -99,6 +99,7 @@ static struct { HWF_RISCV_ZVKB, "riscv-zvkb" }, { HWF_RISCV_ZVKNHA, "riscv-zvknha" }, { HWF_RISCV_ZVKNHB, "riscv-zvknhb" }, + { HWF_RISCV_ZVKNED, "riscv-zvkned" }, #endif }; -- 2.48.1 _______________________________________________ Gcrypt-devel mailing list Gcrypt-devel@gnupg.org https://lists.gnupg.org/mailman/listinfo/gcrypt-devel
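
For reference, one possible way to reproduce the QEMU-based testing mentioned
above. This is only a rough sketch: it assumes a riscv64 cross toolchain, a
cross-built libgpg-error already installed for that host, and a QEMU recent
enough (8.2 or later) to emulate the vector crypto extensions; '-cpu max' and
the use of tests/version and tests/basic are assumptions about the local setup,
not requirements of this patch.

  # Cross-build for riscv64; --disable-shared keeps the test programs
  # directly runnable under qemu-riscv64 user-mode emulation.
  ./configure --host=riscv64-linux-gnu --disable-shared
  make

  # '-cpu max' enables all extensions known to the QEMU build, including
  # Zvkned; alternatively the extensions can be spelled out, e.g.
  # '-cpu rv64,v=true,vlen=128,elen=64,zvkb=true,zvkned=true'.
  qemu-riscv64 -cpu max tests/version   # "riscv-zvkned" should show up in hwflist
  qemu-riscv64 -cpu max tests/basic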