* cipher/Makefile.am: Add 'chacha20-riscv-v.c' and add
ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS handling for
'chacha20-riscv-v.o' and 'chacha20-riscv-v.lo'.
* cipher/chacha20-riscv-v.c: New.
* cipher/chacha20.c (USE_RISCV_V): New.
(CHACHA20_context_s): Add 'use_riscv_v'.
[USE_RISCV_V] (_gcry_chacha20_riscv_v_blocks)
(_gcry_chacha20_riscv_v_check_hw): New.
(chacha20_blocks) [USE_RISCV_V]: Add RISC-V vector code path.
(chacha20_do_setkey) [USE_RISCV_V]: Add HW feature detection for
RISC-V vector implementation.
* configure.ac: Add 'chacha20-riscv-v.lo'.
--
This patch adds a RISC-V vector extension (RVV) implementation of ChaCha20.
A variable-length vector implementation is used for large inputs (4 or more
blocks) and a fixed-width 128-bit vector implementation is used for shorter
inputs.

Benchmark on SpacemiT K1 (1600 MHz):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     10.67 ns/B     89.37 MiB/s     17.07 c/B

After (~3x faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      3.41 ns/B     279.9 MiB/s      5.45 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivili...@iki.fi>
---
 cipher/Makefile.am        |  10 +-
 cipher/chacha20-riscv-v.c | 565 ++++++++++++++++++++++++++++++++++++++
 cipher/chacha20.c         |  29 ++
 configure.ac              |   4 +
 4 files changed, 606 insertions(+), 2 deletions(-)
 create mode 100644 cipher/chacha20-riscv-v.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index a0a4d7d8..d871d38d 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -87,8 +87,8 @@ EXTRA_libcipher_la_SOURCES = \
 	cast5.c cast5-amd64.S cast5-arm.S \
 	chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
 	chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \
-	chacha20-ppc.c chacha20-s390x.S \
-	chacha20-p10le-8x.s \
+	chacha20-ppc.c chacha20-s390x.S chacha20-p10le-8x.s \
+	chacha20-riscv-v.c \
 	cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c \
 	cipher-gcm-aarch64-simd.c cipher-gcm-armv7-neon.S \
 	cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
@@ -359,6 +359,12 @@ else
 riscv_vector_cflags =
 endif
 
+chacha20-riscv-v.o: $(srcdir)/chacha20-riscv-v.c Makefile
+	`echo $(COMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) `
+
+chacha20-riscv-v.lo: $(srcdir)/chacha20-riscv-v.c Makefile
+	`echo $(LTCOMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) `
+
 rijndael-vp-riscv.o: $(srcdir)/rijndael-vp-riscv.c Makefile
 	`echo $(COMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) `
 
diff --git a/cipher/chacha20-riscv-v.c b/cipher/chacha20-riscv-v.c
new file mode 100644
index 00000000..1304a333
--- /dev/null
+++ b/cipher/chacha20-riscv-v.c
@@ -0,0 +1,565 @@
+/* chacha20-riscv-v.c - RISC-V vector implementation of ChaCha20
+ * Copyright (C) 2025 Jussi Kivilinna <jussi.kivili...@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined (__riscv) && \
+    defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \
+    defined(USE_CHACHA20)
+
+#include "simd-common-riscv.h"
+#include <riscv_vector.h>
+#include "bufhelp.h"
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+
+/**********************************************************************
+  RISC-V vector extension chacha20
+ **********************************************************************/
+
+#define ROTATE16(v)  __riscv_vreinterpret_v_u16m1_u32m1( \
+                       __riscv_vrgather_vv_u16m1( \
+                         __riscv_vreinterpret_v_u32m1_u16m1(v), \
+                         rot16, vl * 2))
+#define ROTATE8(v)   __riscv_vreinterpret_v_u8m1_u32m1( \
+                       __riscv_vrgather_vv_u8m1( \
+                         __riscv_vreinterpret_v_u32m1_u8m1(v), \
+                         rot8, vl * 4))
+#define ROTATE(v, c) __riscv_vadd_vv_u32m1( \
+                       __riscv_vsll_vx_u32m1((v), (c), vl), \
+                       __riscv_vsrl_vx_u32m1((v), 32 - (c), vl), vl)
+#define XOR(v, w)    __riscv_vxor_vv_u32m1((v), (w), vl)
+#define PLUS(v, w)   __riscv_vadd_vv_u32m1((v), (w), vl)
+#define WORD_ROL(v, c) __riscv_vrgather_vv_u32m1((v), (rol##c), vl)
+
+#define QUARTERROUND_4(a0, b0, c0, d0, a1, b1, c1, d1, \
+                       a2, b2, c2, d2, a3, b3, c3, d3) \
+  a0 = PLUS(a0, b0);   a1 = PLUS(a1, b1);   \
+  a2 = PLUS(a2, b2);   a3 = PLUS(a3, b3);   \
+  d0 = XOR(d0, a0);    d1 = XOR(d1, a1);    \
+  d2 = XOR(d2, a2);    d3 = XOR(d3, a3);    \
+  d0 = ROTATE16(d0);   d1 = ROTATE16(d1);   \
+  d2 = ROTATE16(d2);   d3 = ROTATE16(d3);   \
+  c0 = PLUS(c0, d0);   c1 = PLUS(c1, d1);   \
+  c2 = PLUS(c2, d2);   c3 = PLUS(c3, d3);   \
+  b0 = XOR(b0, c0);    b1 = XOR(b1, c1);    \
+  b2 = XOR(b2, c2);    b3 = XOR(b3, c3);    \
+  b0 = ROTATE(b0, 12); b1 = ROTATE(b1, 12); \
+  b2 = ROTATE(b2, 12); b3 = ROTATE(b3, 12); \
+  a0 = PLUS(a0, b0);   a1 = PLUS(a1, b1);   \
+  a2 = PLUS(a2, b2);   a3 = PLUS(a3, b3);   \
+  d0 = XOR(d0, a0);    d1 = XOR(d1, a1);    \
+  d2 = XOR(d2, a2);    d3 = XOR(d3, a3);    \
+  d0 = ROTATE8(d0);    d1 = ROTATE8(d1);    \
+  d2 = ROTATE8(d2);    d3 = ROTATE8(d3);    \
+  c0 = PLUS(c0, d0);   c1 = PLUS(c1, d1);   \
+  c2 = PLUS(c2, d2);   c3 = PLUS(c3, d3);   \
+  b0 = XOR(b0, c0);    b1 = XOR(b1, c1);    \
+  b2 = XOR(b2, c2);    b3 = XOR(b3, c3);    \
+  b0 = ROTATE(b0, 7);  b1 = ROTATE(b1, 7);  \
+  b2 = ROTATE(b2, 7);  b3 = ROTATE(b3, 7);
+
+#define QUARTERROUND4_2(x0, x1, x2, x3, y0, y1, y2, y3, rol_x1, rol_x2, rol_x3) \
+  x0 = PLUS(x0, x1);   y0 = PLUS(y0, y1);   \
+  x3 = XOR(x3, x0);    y3 = XOR(y3, y0);    \
+  x3 = ROTATE16(x3);   y3 = ROTATE16(y3);   \
+  x2 = PLUS(x2, x3);   y2 = PLUS(y2, y3);   \
+  x1 = XOR(x1, x2);    y1 = XOR(y1, y2);    \
+  x1 = ROTATE(x1, 12); y1 = ROTATE(y1, 12); \
+  x0 = PLUS(x0, x1);   y0 = PLUS(y0, y1);   \
+  x3 = XOR(x3, x0);    y3 = XOR(y3, y0);    \
+  x3 = ROTATE8(x3);    y3 = ROTATE8(y3);    \
+  x2 = PLUS(x2, x3);   y2 = PLUS(y2, y3);   \
+  x3 = WORD_ROL(x3, rol_x3); y3 = WORD_ROL(y3, rol_x3); \
+  x1 = XOR(x1, x2);    y1 = XOR(y1, y2);    \
+  x2 = WORD_ROL(x2, rol_x2); y2 = WORD_ROL(y2, rol_x2); \
+  x1 = ROTATE(x1, 7);  y1 = ROTATE(y1, 7);  \
+  x1 = WORD_ROL(x1, rol_x1); y1 = WORD_ROL(y1, rol_x1);
+
+#define QUARTERROUND4(x0, x1, x2, x3, rol_x1, rol_x2, rol_x3) \
+  x0 = PLUS(x0, x1); x3 = XOR(x3, x0); x3 = ROTATE16(x3); \
+  x2 = PLUS(x2, x3); x1 = XOR(x1, x2); x1 = ROTATE(x1, 12); \
+  x0 = PLUS(x0, x1); x3 = XOR(x3, x0); x3 = ROTATE8(x3); \
+  x2 = PLUS(x2, x3); \
+  x3 = WORD_ROL(x3, rol_x3); \
+  x1 = XOR(x1, x2); \
+  x2 = WORD_ROL(x2, rol_x2); \
+  x1 = ROTATE(x1, 7); \
+  x1 = WORD_ROL(x1, rol_x1);
+
+#define ADD_U64(a, b) __riscv_vreinterpret_v_u64m1_u32m1( \
+                        __riscv_vadd_vv_u64m1( \
+                          __riscv_vreinterpret_v_u32m1_u64m1(a), \
+                          __riscv_vreinterpret_v_u32m1_u64m1(b), vl / 2))
+
+#define vxor_v_u32m1_u32m1x8(data, idx, vs, vl) \
+        __riscv_vset_v_u32m1_u32m1x8((data), (idx), \
+          __riscv_vxor_vv_u32m1( \
+            __riscv_vget_v_u32m1x8_u32m1((data), (idx)), (vs), (vl)))
+
+static ASM_FUNC_ATTR_INLINE vuint16m1_t
+gen_rot16(size_t vl)
+{
+  return __riscv_vxor_vx_u16m1(__riscv_vid_v_u16m1(vl * 2), 1, vl * 2);
+}
+
+static ASM_FUNC_ATTR_INLINE vuint8m1_t
+gen_rot8(size_t vl)
+{
+  vuint8m1_t rot8, rot8_hi;
+
+  rot8 = __riscv_vid_v_u8m1(vl * 4);
+  rot8_hi = __riscv_vand_vx_u8m1(rot8, ~3, vl * 4);
+  rot8 = __riscv_vadd_vx_u8m1(rot8, 3, vl * 4);
+  rot8 = __riscv_vand_vx_u8m1(rot8, 3, vl * 4);
+  rot8 = __riscv_vadd_vv_u8m1(rot8, rot8_hi, vl * 4);
+
+  return rot8;
+}
+
+static ASM_FUNC_ATTR_INLINE vuint16m2_t
+gen_indexes(size_t vl, size_t stride)
+{
+  vuint16m2_t idx = __riscv_vid_v_u16m2(vl * 4);
+  vuint16m2_t idx_lo = __riscv_vand_vx_u16m2(idx, 3, vl * 4);
+  vuint16m2_t idx_hi = __riscv_vsrl_vx_u16m2(idx, 2, vl * 4);
+  idx_hi = __riscv_vmul_vx_u16m2(idx_hi, stride, vl * 4);
+  return __riscv_vadd_vv_u16m2(idx_hi, idx_lo, vl * 4);
+}
+
+static ASM_FUNC_ATTR_INLINE vuint32m1x8_t
+unaligned_vlsseg8e32_v_u32m1x8(const void *src, size_t vl)
+{
+  const byte *bsrc = src;
+  vuint16m2_t indexes;
+  vuint8m1_t b0, b1, b2, b3, b4, b5, b6, b7;
+  vuint32m1x8_t data;
+
+  if (LIKELY(((uintptr_t)src & 3) == 0))
+    {
+      /* Fast path for 32-bit aligned loads. */
+      return __riscv_vlsseg8e32_v_u32m1x8(src, 64, vl);
+    }
+
+  indexes = gen_indexes(4 * vl, 64);
+
+  b0 = __riscv_vluxei16_v_u8m1(bsrc + 0 * 4, indexes, vl * 4);
+  b1 = __riscv_vluxei16_v_u8m1(bsrc + 1 * 4, indexes, vl * 4);
+  b2 = __riscv_vluxei16_v_u8m1(bsrc + 2 * 4, indexes, vl * 4);
+  b3 = __riscv_vluxei16_v_u8m1(bsrc + 3 * 4, indexes, vl * 4);
+  b4 = __riscv_vluxei16_v_u8m1(bsrc + 4 * 4, indexes, vl * 4);
+  b5 = __riscv_vluxei16_v_u8m1(bsrc + 5 * 4, indexes, vl * 4);
+  b6 = __riscv_vluxei16_v_u8m1(bsrc + 6 * 4, indexes, vl * 4);
+  b7 = __riscv_vluxei16_v_u8m1(bsrc + 7 * 4, indexes, vl * 4);
+
+  data = __riscv_vundefined_u32m1x8();
+  data = __riscv_vset_v_u32m1_u32m1x8(
+           data, 0, __riscv_vreinterpret_v_u8m1_u32m1(b0));
+  data = __riscv_vset_v_u32m1_u32m1x8(
+           data, 1, __riscv_vreinterpret_v_u8m1_u32m1(b1));
+  data = __riscv_vset_v_u32m1_u32m1x8(
+           data, 2, __riscv_vreinterpret_v_u8m1_u32m1(b2));
+  data = __riscv_vset_v_u32m1_u32m1x8(
+           data, 3, __riscv_vreinterpret_v_u8m1_u32m1(b3));
+  data = __riscv_vset_v_u32m1_u32m1x8(
+           data, 4, __riscv_vreinterpret_v_u8m1_u32m1(b4));
+  data = __riscv_vset_v_u32m1_u32m1x8(
+           data, 5, __riscv_vreinterpret_v_u8m1_u32m1(b5));
+  data = __riscv_vset_v_u32m1_u32m1x8(
+           data, 6, __riscv_vreinterpret_v_u8m1_u32m1(b6));
+  data = __riscv_vset_v_u32m1_u32m1x8(
+           data, 7, __riscv_vreinterpret_v_u8m1_u32m1(b7));
+
+  return data;
+}
+
+static ASM_FUNC_ATTR_INLINE void
+unaligned_vssseg8e32_v_u32m1x8(void *dst, vuint32m1x8_t data, size_t vl)
+{
+  byte *bdst = dst;
+  vuint16m2_t indexes;
+  vuint8m1_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+  if (LIKELY(((uintptr_t)dst & 3) == 0))
+    {
+      /* Fast path for 32-bit aligned stores. */
+      __riscv_vssseg8e32_v_u32m1x8(dst, 64, data, vl);
+      return;
+    }
+
+  indexes = gen_indexes(4 * vl, 64);
+
+  b0 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 0));
+  b1 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 1));
+  b2 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 2));
+  b3 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 3));
+  b4 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 4));
+  b5 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 5));
+  b6 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 6));
+  b7 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 7));
+
+  __riscv_vsuxei16_v_u8m1(bdst + 0 * 4, indexes, b0, vl * 4);
+  __riscv_vsuxei16_v_u8m1(bdst + 1 * 4, indexes, b1, vl * 4);
+  __riscv_vsuxei16_v_u8m1(bdst + 2 * 4, indexes, b2, vl * 4);
+  __riscv_vsuxei16_v_u8m1(bdst + 3 * 4, indexes, b3, vl * 4);
+  __riscv_vsuxei16_v_u8m1(bdst + 4 * 4, indexes, b4, vl * 4);
+  __riscv_vsuxei16_v_u8m1(bdst + 5 * 4, indexes, b5, vl * 4);
+  __riscv_vsuxei16_v_u8m1(bdst + 6 * 4, indexes, b6, vl * 4);
+  __riscv_vsuxei16_v_u8m1(bdst + 7 * 4, indexes, b7, vl * 4);
+}
+
+static ASM_FUNC_ATTR_INLINE unsigned int
+chacha20_rvv_blocks(u32 *input, byte *dst, const byte *src, size_t nblks)
+{
+  unsigned int i;
+
+  if (nblks == 0)
+    return 0;
+
+  /* Try to use the vector implementation when there are 4 or more blocks. */
+  if (nblks >= 4)
+    {
+      size_t vl = __riscv_vsetvl_e32m1(nblks) < 4
+                    ? __riscv_vsetvl_e32m1(4) : __riscv_vsetvl_e32m1(nblks);
+      vuint32m1_t x0, x1, x2, x3, x4, x5, x6, x7;
+      vuint32m1_t x8, x9, x10, x11, x12, x13, x14, x15;
+      u32 s0, s1, s2, s3, s4, s5, s6, s7;
+      u32 s8, s9, s10, s11, s12, s13, s14, s15;
+      vuint16m1_t rot16 = gen_rot16(vl);
+      vuint8m1_t rot8 = gen_rot8(vl);
+
+      s0 = input[0];
+      s1 = input[1];
+      s2 = input[2];
+      s3 = input[3];
+      s4 = input[4];
+      s5 = input[5];
+      s6 = input[6];
+      s7 = input[7];
+      s8 = input[8];
+      s9 = input[9];
+      s10 = input[10];
+      s11 = input[11];
+      s12 = input[12];
+      s13 = input[13];
+      s14 = input[14];
+      s15 = input[15];
+
+      while (nblks >= 4)
+        {
+          vuint32m1_t ctr;
+          vbool32_t carry;
+          vuint32m1x8_t data;
+
+          if (vl < 4)
+            break;
+
+          x0 = __riscv_vmv_v_x_u32m1(s0, vl);
+          x1 = __riscv_vmv_v_x_u32m1(s1, vl);
+          x2 = __riscv_vmv_v_x_u32m1(s2, vl);
+          x3 = __riscv_vmv_v_x_u32m1(s3, vl);
+          x4 = __riscv_vmv_v_x_u32m1(s4, vl);
+          x5 = __riscv_vmv_v_x_u32m1(s5, vl);
+          x6 = __riscv_vmv_v_x_u32m1(s6, vl);
+          x7 = __riscv_vmv_v_x_u32m1(s7, vl);
+          x8 = __riscv_vmv_v_x_u32m1(s8, vl);
+          x9 = __riscv_vmv_v_x_u32m1(s9, vl);
+          x10 = __riscv_vmv_v_x_u32m1(s10, vl);
+          x11 = __riscv_vmv_v_x_u32m1(s11, vl);
+          x13 = __riscv_vmv_v_x_u32m1(s13, vl);
+          x14 = __riscv_vmv_v_x_u32m1(s14, vl);
+          x15 = __riscv_vmv_v_x_u32m1(s15, vl);
+
+          ctr = __riscv_vid_v_u32m1(vl);
+          carry = __riscv_vmadc_vx_u32m1_b32(ctr, s12, vl);
+          ctr = __riscv_vadd_vx_u32m1(ctr, s12, vl);
+          x12 = ctr;
+          x13 = __riscv_vadc_vxm_u32m1(x13, 0, carry, vl);
+
+          for (i = 20; i > 0; i -= 2)
+            {
+              QUARTERROUND_4(x0, x4, x8,  x12,
+                             x1, x5, x9,  x13,
+                             x2, x6, x10, x14,
+                             x3, x7, x11, x15);
+              QUARTERROUND_4(x0, x5, x10, x15,
+                             x1, x6, x11, x12,
+                             x2, x7, x8,  x13,
+                             x3, x4, x9,  x14);
+            }
+
+          x0 = __riscv_vadd_vx_u32m1(x0, s0, vl);
+          x1 = __riscv_vadd_vx_u32m1(x1, s1, vl);
+          x2 = __riscv_vadd_vx_u32m1(x2, s2, vl);
+          x3 = __riscv_vadd_vx_u32m1(x3, s3, vl);
+          x4 = __riscv_vadd_vx_u32m1(x4, s4, vl);
+          x5 = __riscv_vadd_vx_u32m1(x5, s5, vl);
+          x6 = __riscv_vadd_vx_u32m1(x6, s6, vl);
+          x7 = __riscv_vadd_vx_u32m1(x7, s7, vl);
+          x8 = __riscv_vadd_vx_u32m1(x8, s8, vl);
+          x9 = __riscv_vadd_vx_u32m1(x9, s9, vl);
+          x10 = __riscv_vadd_vx_u32m1(x10, s10, vl);
+          x11 = __riscv_vadd_vx_u32m1(x11, s11, vl);
+          x12 = __riscv_vadd_vv_u32m1(x12, ctr, vl);
+          x13 = __riscv_vadc_vxm_u32m1(x13, s13, carry, vl);
+          x14 = __riscv_vadd_vx_u32m1(x14, s14, vl);
+          x15 = __riscv_vadd_vx_u32m1(x15, s15, vl);
+
+          s12 += vl;
+          s13 += s12 < vl;
+
+          data = unaligned_vlsseg8e32_v_u32m1x8((const void *)src, vl);
+
+          data = vxor_v_u32m1_u32m1x8(data, 0, x0, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 1, x1, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 2, x2, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 3, x3, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 4, x4, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 5, x5, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 6, x6, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 7, x7, vl);
+
+          unaligned_vssseg8e32_v_u32m1x8((void *)dst, data, vl);
+
+          data = unaligned_vlsseg8e32_v_u32m1x8((const void *)(src + 32), vl);
+
+          data = vxor_v_u32m1_u32m1x8(data, 0, x8, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 1, x9, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 2, x10, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 3, x11, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 4, x12, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 5, x13, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 6, x14, vl);
+          data = vxor_v_u32m1_u32m1x8(data, 7, x15, vl);
+
+          unaligned_vssseg8e32_v_u32m1x8((void *)(dst + 32), data, vl);
+
+          src += vl * 64;
+          dst += vl * 64;
+          nblks -= vl;
+          vl = __riscv_vsetvl_e32m1(nblks) < 4
+                 ? __riscv_vsetvl_e32m1(4) : __riscv_vsetvl_e32m1(nblks);
+        }
+
+      input[12] = s12;
+      input[13] = s13;
+    }
+
+  /* Use SIMD implementation for remaining blocks. */
+  if (nblks > 0)
+    {
+      static const u32 rol_const[3][4] =
+      {
+        { 1, 2, 3, 0 },
+        { 2, 3, 0, 1 },
+        { 3, 0, 1, 2 }
+      };
+      static const u32 one_u64_const[4] = { 1, 0, 0, 0 };
+      size_t vl = 4;
+      vuint32m1_t rol1, rol2, rol3;
+      vuint32m1_t one_u64;
+      vuint32m1_t v0, v1, v2, v3;
+      vuint32m1_t v4, v5, v6, v7;
+      vuint32m1_t state0, state1, state2, state3;
+      vuint8m1_t i0, i1, i2, i3;
+      vuint8m1_t i4, i5, i6, i7;
+      vuint16m1_t rot16 = gen_rot16(vl);
+      vuint8m1_t rot8 = gen_rot8(vl);
+
+      rol1 = __riscv_vle32_v_u32m1(rol_const[0], vl);
+      rol2 = __riscv_vle32_v_u32m1(rol_const[1], vl);
+      rol3 = __riscv_vle32_v_u32m1(rol_const[2], vl);
+      one_u64 = __riscv_vle32_v_u32m1(one_u64_const, vl);
+
+      state0 = __riscv_vle32_v_u32m1(&input[0], vl);
+      state1 = __riscv_vle32_v_u32m1(&input[4], vl);
+      state2 = __riscv_vle32_v_u32m1(&input[8], vl);
+      state3 = __riscv_vle32_v_u32m1(&input[12], vl);
+
+      input[12] += nblks;
+      input[13] += input[12] < nblks;
+
+      /* SIMD 2x block implementation */
+      while (nblks >= 2)
+        {
+          v0 = state0;
+          v1 = state1;
+          v2 = state2;
+          v3 = state3;
+
+          v4 = state0;
+          v5 = state1;
+          v6 = state2;
+          v7 = state3;
+          v7 = ADD_U64(v7, one_u64);
+
+          i0 = __riscv_vle8_v_u8m1(src + 0 * 16, vl * 4);
+          i1 = __riscv_vle8_v_u8m1(src + 1 * 16, vl * 4);
+          i2 = __riscv_vle8_v_u8m1(src + 2 * 16, vl * 4);
+          i3 = __riscv_vle8_v_u8m1(src + 3 * 16, vl * 4);
+
+          for (i = 20; i > 0; i -= 2)
+            {
+              QUARTERROUND4_2(v0, v1, v2, v3, v4, v5, v6, v7, 1, 2, 3);
+              QUARTERROUND4_2(v0, v1, v2, v3, v4, v5, v6, v7, 3, 2, 1);
+            }
+
+          v0 = __riscv_vadd_vv_u32m1(v0, state0, vl);
+          v1 = __riscv_vadd_vv_u32m1(v1, state1, vl);
+          v2 = __riscv_vadd_vv_u32m1(v2, state2, vl);
+          v3 = __riscv_vadd_vv_u32m1(v3, state3, vl);
+          state3 = ADD_U64(state3, one_u64);
+
+          v0 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i0),
+                                     v0, vl);
+          v1 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i1),
+                                     v1, vl);
+          v2 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i2),
+                                     v2, vl);
+          v3 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i3),
+                                     v3, vl);
+
+          v4 = __riscv_vadd_vv_u32m1(v4, state0, vl);
+          v5 = __riscv_vadd_vv_u32m1(v5, state1, vl);
+          v6 = __riscv_vadd_vv_u32m1(v6, state2, vl);
+          v7 = __riscv_vadd_vv_u32m1(v7, state3, vl);
+          state3 = ADD_U64(state3, one_u64);
+
+          i4 = __riscv_vle8_v_u8m1(src + 4 * 16, vl * 4);
+          i5 = __riscv_vle8_v_u8m1(src + 5 * 16, vl * 4);
+          i6 = __riscv_vle8_v_u8m1(src + 6 * 16, vl * 4);
+          i7 = __riscv_vle8_v_u8m1(src + 7 * 16, vl * 4);
+
+          __riscv_vse8_v_u8m1(dst + 0 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v0), vl * 4);
+          __riscv_vse8_v_u8m1(dst + 1 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v1), vl * 4);
+          __riscv_vse8_v_u8m1(dst + 2 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v2), vl * 4);
+          __riscv_vse8_v_u8m1(dst + 3 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v3), vl * 4);
+
+          v4 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i4),
+                                     v4, vl);
+          v5 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i5),
+                                     v5, vl);
+          v6 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i6),
+                                     v6, vl);
+          v7 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i7),
+                                     v7, vl);
+
+          __riscv_vse8_v_u8m1(dst + 4 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v4), vl * 4);
+          __riscv_vse8_v_u8m1(dst + 5 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v5), vl * 4);
+          __riscv_vse8_v_u8m1(dst + 6 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v6), vl * 4);
+          __riscv_vse8_v_u8m1(dst + 7 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v7), vl * 4);
+
+          src += 2 * 64;
+          dst += 2 * 64;
+
+          nblks -= 2;
+        }
+
+      /* 1x block implementation */
+      while (nblks)
+        {
+          v0 = state0;
+          v1 = state1;
+          v2 = state2;
+          v3 = state3;
+
+          i0 = __riscv_vle8_v_u8m1(src + 0 * 16, vl * 4);
+          i1 = __riscv_vle8_v_u8m1(src + 1 * 16, vl * 4);
+          i2 = __riscv_vle8_v_u8m1(src + 2 * 16, vl * 4);
+          i3 = __riscv_vle8_v_u8m1(src + 3 * 16, vl * 4);
+
+          for (i = 20; i > 0; i -= 2)
+            {
+              QUARTERROUND4(v0, v1, v2, v3, 1, 2, 3);
+              QUARTERROUND4(v0, v1, v2, v3, 3, 2, 1);
+            }
+
+          v0 = __riscv_vadd_vv_u32m1(v0, state0, vl);
+          v1 = __riscv_vadd_vv_u32m1(v1, state1, vl);
+          v2 = __riscv_vadd_vv_u32m1(v2, state2, vl);
+          v3 = __riscv_vadd_vv_u32m1(v3, state3, vl);
+
+          state3 = ADD_U64(state3, one_u64);
+
+          v0 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i0),
+                                     v0, vl);
+          v1 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i1),
+                                     v1, vl);
+          v2 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i2),
+                                     v2, vl);
+          v3 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i3),
+                                     v3, vl);
+          __riscv_vse8_v_u8m1(dst + 0 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v0), vl * 4);
+          __riscv_vse8_v_u8m1(dst + 1 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v1), vl * 4);
+          __riscv_vse8_v_u8m1(dst + 2 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v2), vl * 4);
+          __riscv_vse8_v_u8m1(dst + 3 * 16,
+                              __riscv_vreinterpret_v_u32m1_u8m1(v3), vl * 4);
+          src += 64;
+          dst += 64;
+
+          nblks--;
+        }
+    }
+
+  clear_vec_regs();
+
+  return 0;
+}
+
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT_O2
+#endif
+
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_OPT_O2
+_gcry_chacha20_riscv_v_blocks(u32 *state, byte *dst, const byte *src,
+                              size_t nblks)
+{
+  return chacha20_rvv_blocks(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_OPT_O2
+_gcry_chacha20_riscv_v_check_hw(void)
+{
+  return (__riscv_vsetvl_e8m1(16) == 16);
+}
+
+#endif /* HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS */
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index ca8176f4..8b547db3 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -113,6 +113,12 @@
 # endif /* USE_S390X_VX */
 #endif
 
+/* USE_RISCV_V indicates whether to enable RISC-V vector extension code. */
+#undef USE_RISCV_V
+#if defined (__riscv) && defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS)
+# define USE_RISCV_V 1
+#endif
+
 /* Assembly implementations use SystemV ABI, ABI conversion and additional
  * stack to store XMM6-XMM15 needed on Win64. */
 #undef ASM_FUNC_ABI
@@ -137,6 +143,7 @@ typedef struct CHACHA20_context_s
   unsigned int use_p9:1;
   unsigned int use_p10:1;
   unsigned int use_s390x:1;
+  unsigned int use_riscv_v:1;
 } CHACHA20_context_t;
 
 
@@ -259,6 +266,16 @@ unsigned int _gcry_chacha20_poly1305_aarch64_blocks4(
 
 #endif /* USE_AARCH64_SIMD */
 
+#ifdef USE_RISCV_V
+
+unsigned int _gcry_chacha20_riscv_v_blocks(u32 *state, byte *dst,
+                                           const byte *src,
+                                           size_t nblks);
+
+unsigned int _gcry_chacha20_riscv_v_check_hw(void);
+
+#endif /* USE_RISCV_V */
+
 
 static const char *selftest (void);
 
@@ -396,6 +413,13 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
     }
 #endif
 
+#ifdef USE_RISCV_V
+  if (ctx->use_riscv_v)
+    {
+      return _gcry_chacha20_riscv_v_blocks(ctx->input, dst, src, nblks);
+    }
+#endif
+
   return do_chacha20_blocks (ctx->input, dst, src, nblks);
 }
 
@@ -538,6 +562,11 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
 #ifdef USE_S390X_VX
   ctx->use_s390x = (features & HWF_S390X_VX) != 0;
 #endif
+#ifdef USE_RISCV_V
+  ctx->use_riscv_v = (features & HWF_RISCV_IMAFDC)
+                     && (features & HWF_RISCV_V)
+                     && _gcry_chacha20_riscv_v_check_hw();
+#endif
 
   (void)features;
 
diff --git a/configure.ac b/configure.ac
index fbe82695..4e9f1754 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3510,6 +3510,10 @@ if test "$found" = "1" ; then
          # Build with the s390x/zSeries vector implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-s390x.lo"
       ;;
+      riscv64-*-*)
+         # Build with the RISC-V vector implementation
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-riscv-v.lo"
+      ;;
    esac
 fi
 
-- 
2.45.2
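Two details of the RVV code above are easy to miss when reading the
intrinsics: the 32-bit rotations by 16 and 8 are done as vrgather
permutations whose indexes are built by gen_rot16()/gen_rot8(), and the
64-bit block counter is kept in state words 12 and 13 with the carry
handled explicitly (vmadc/vadc per lane, plus "s12 += vl; s13 += s12 < vl;"
for the scalar copy).  The scalar sketch below is not part of the patch; it
only illustrates those two identities with plain uint32_t arithmetic so they
can be checked independently of the vector code:

/* Scalar sketch (not part of the patch): the identities behind ROTATE16
 * and the split 64-bit block counter in chacha20_rvv_blocks(). */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Reference 32-bit rotate-left. */
static uint32_t rol32(uint32_t x, unsigned int c)
{
  return (x << c) | (x >> (32 - c));
}

int main(void)
{
  /* 1. Rotating a 32-bit word by 16 swaps its two 16-bit halves;
   *    gen_rot16() encodes this by gathering halfword i from index i ^ 1. */
  uint32_t w = 0x11223344;
  uint32_t swapped = ((w & 0xffff) << 16) | (w >> 16);
  assert(swapped == rol32(w, 16));

  /* 2. The 64-bit block counter lives in input[12] (low) and input[13]
   *    (high).  Advancing it by 'vl' blocks with an explicit carry mirrors
   *    "s12 += vl; s13 += s12 < vl;" in the >= 4 block loop. */
  uint32_t s12 = 0xfffffffeu, s13 = 0;  /* counter just below a low-word wrap */
  uint32_t vl = 4;                      /* blocks processed in one iteration */
  s12 += vl;
  s13 += s12 < vl;                      /* carry out of the low word */
  assert(s13 == 1 && s12 == 2);

  printf("rol16(0x%08" PRIx32 ") = 0x%08" PRIx32 ", counter = 0x%08" PRIx32
         "%08" PRIx32 "\n", w, swapped, s13, s12);
  return 0;
}

The same two building blocks are what let the variable-length path scale
with VLEN (the permutation indexes are generated for whatever vl is active),
while the 2x/1x fallback stays within fixed 128-bit registers.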