On Sat, Oct 15, 2022 at 9:37 AM Niels Möller <ni...@lysator.liu.se> wrote:

> Maamoun TK <maamoun...@googlemail.com> writes:
>
> > I updated https://git.lysator.liu.se/nettle/nettle/-/merge_requests/48
> to
> > have the logic of processing partial blocks in C files.
>
> I was thinking of something like
>
> const uint8_t *
> _nettle_poly1305_update(struct poly1305_ctx*,
>                         size_t blocks, const uint8_t *data);
>
> as the interface to assembly implementations. May need some additional
> argument to be able to handle last partial block (I think it's fine to
> have the main case handle only full blocks, i.e., the t4 == 1 case if I
> remember correctly), but we need *some* way to also deal with t4 == 0
> for the final block, that could be an additional t4 argument applied
> only to the last block processed, or possibly a separate entry point. Or
> even some other hack, like a special value for blocks (0 or ~0) meaning
thet [sic: that] there's only a single block to process, and that t4 = 0, not sure
> what's the best design.
>

I think the design could be as simple as always padding each block with
0x01 in _nettle_poly1305_update, while keeping _nettle_poly1305_block —
which is responsible for processing the last block — taking a variable
padding value (0 or 1). I committed an update to
https://git.lysator.liu.se/nettle/nettle/-/merge_requests/48 that applies
that design.

> Let me give it a shot on my own during weekend.
>
> Great, please do. I had a quick look yesterday, let me share my start.
> The parts that are potentially useful for you are the configure test and
> related logic for __int128 (the only part of the patch actually tested),
> and the reorganization of fields in poly1305_ctx (without changing size
> or alignment, at least that's the intention). See below.
>

I've implemented Poly1305 based on 2^44 in C and got the following
benchmark results of update function tested on Core i5-10300H
*---------------------------------------+------------*
|  Radix                                |   Mbyte/s  |
|---------------------------------------+------------|
|  2^26  (C)                            |   1397.53  |
|  2^44  (C)                            |   1734.09  |
|  2^64  (C)                            |   2645.74  |
|  2^64  (C, deferred partial folding)  |   3048.93  |
|  2^64  (Assembly - latest)            |   4001.14  |
*---------------------------------------+------------*
I used two versions of 2^64 based C-implementations you shared earlier for
this benchmark. As it stands, Poly1305 based on 2^64 with deferred partial
folding performs better compared with other C implementations for single
block processing.

I will attach a working implementation of radix 2^44 below.

regards,
Mamone

diff --git a/configure.ac b/configure.ac
index 59f68b00..55186127 100644
--- a/configure.ac
+++ b/configure.ac
@@ -242,6 +242,22 @@ if test "x$nettle_cv_c_builtin_bswap64" = "xyes" ; then
   AC_DEFINE(HAVE_BUILTIN_BSWAP64)
 fi

+AC_CACHE_CHECK([for __int128],
+              nettle_cv_c_int128,
+[AC_TRY_LINK([
+#include <stdint.h>
+],[
+uint64_t x = 1;
+uint64_t y = 2;
+unsigned __int128 z = x * (unsigned __int128) y;
+],
+nettle_cv_c_int128=yes,
+nettle_cv_c_int128=no)])
+
+AH_TEMPLATE([HAVE_TYPE_INT128], [Define if __int128 type is available])
+if test "$nettle_cv_c_int128" = "yes" ; then
+  AC_DEFINE(HAVE_TYPE_INT128)
+fi
 LSH_GCC_ATTRIBUTES

 # Check for file locking. We (AC_PROG_CC?) have already checked for
diff --git a/poly1305-internal.c b/poly1305-internal.c
index 380b934e..1b4bf46e 100644
--- a/poly1305-internal.c
+++ b/poly1305-internal.c
@@ -67,24 +67,6 @@

 #include "macros.h"

-#define mul32x32_64(a,b) ((uint64_t)(a) * (b))
-
-#define r0 r.r32[0]
-#define r1 r.r32[1]
-#define r2 r.r32[2]
-#define r3 r.r32[3]
-#define r4 r.r32[4]
-#define s1 r.r32[5]
-#define s2 s32[0]
-#define s3 s32[1]
-#define s4 s32[2]
-
-#define h0 h.h32[0]
-#define h1 h.h32[1]
-#define h2 h.h32[2]
-#define h3 h.h32[3]
-#define h4 hh
-
 /* For fat builds */
 #if HAVE_NATIVE_poly1305_set_key
 void
@@ -107,6 +89,122 @@ _nettle_poly1305_digest_c(struct poly1305_ctx *ctx,
 # define _nettle_poly1305_digest _nettle_poly1305_digest_c
 #endif

+#if 1 && HAVE_NATIVE_64_BIT && HAVE_TYPE_INT128
+#define mul64x64_128(a,b) ((unsigned __int128) (a) * (b))
+#define r0 r.r64[0]
+#define r1 r.r64[1]
+#define r2 r.r64[2]
+#define s1 r.r64[3]
+#define hh0 r.r64[4]
+#define hh1 r.r64[5]
+#define hh2 r.r64[6]
+
+void
+_nettle_poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
+{
+  uint64_t t0,t1;
+
+  t0 = LE_READ_UINT64(key);
+  t1 = LE_READ_UINT64(key+8);
+
+  ctx->r0 = 0xffc0fffffff & t0;
+  ctx->r1 = 0xfffffc0ffff & ((t0 >> 44) | (t1 << 20));
+  ctx->r2 = 0x00ffffffc0f & (t1 >> 24);
+  ctx->s1 = 20 * ctx->r1;
+
+  ctx->hh0 = 0;
+  ctx->hh1 = 0;
+  ctx->hh2 = 0;
+}
+
+void
+_nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m,
unsigned t4)
+{
+  uint64_t t0, t1, s2, h0, h1, h2;
+  unsigned __int128 p0, p1, p2;
+  t0 = LE_READ_UINT64(m);
+  t1 = LE_READ_UINT64(m+8);
+
+  h0 = ctx->hh0 + (0xfffffffffff & t0);
+  h1 = ctx->hh1 + (0xfffffffffff & ((t0 >> 44) | (t1 << 20)));
+  h2 = ctx->hh2 + ((t1 >> 24) | ((uint64_t) t4 << 40));
+
+  s2 = 20 * ctx->r2;
+  p0 = mul64x64_128 (h0, ctx->r0) + mul64x64_128 (h1, s2) + mul64x64_128
(h2, ctx->s1);
+  p1 = mul64x64_128 (h0, ctx->r1) + mul64x64_128 (h1, ctx->r0) +
mul64x64_128 (h2, s2);
+  p2 = mul64x64_128 (h0, ctx->r2) + mul64x64_128 (h1, ctx->r1) +
mul64x64_128 (h2, ctx->r0);
+
+  p1 += p0 >> 44;
+  p2 += p1 >> 44;
+  ctx->hh2 = 0x3ffffffffff & p2;
+  p2 >>= 42;
+  p0 = (p0 & 0xfffffffffff) + (p2 << 2) + p2;
+  ctx->hh0 = 0xfffffffffff & p0;
+  ctx->hh1 = (0xfffffffffff & p1) + (p0 >> 44);
+}
+
+/* Adds digest to the nonce */
+void
+_nettle_poly1305_digest (struct poly1305_ctx *ctx, union nettle_block16 *s)
+{
+  uint64_t t0, t1, t2, c1, mask, m0;
+  t0 = ctx->hh0;
+  t1 = ctx->hh1;
+  t2 = ctx->hh2;
+
+  /* Moving carry */
+  t2 += t1 >> 44;
+  t0 += (t2 >> 42) * 5;
+  t1 = (t1 & 0xfffffffffff) + (t0 >> 44);
+  t0 &= 0xfffffffffff;
+  t2 &= 0x3ffffffffff;
+
+  /* Convert state of radix 2^44 to 2^64 */
+  t0 |= t1 << 44;
+  t1 = (t1 >> 20) | (t2 << 24);
+  t2 >>= 40;
+
+  /* Compute resulting carries when adding 5. */
+  c1 = t0 >= -(UINT64_C(5));
+  t2 += (t1 + c1 < c1);
+
+  /* Set if H >= 2^130 - 5 */
+  mask = - (t2 >> 2);
+
+  t0 += mask & 5;
+  t1 += mask & c1;
+
+  /* FIXME: Take advantage of s being aligned as an unsigned long. */
+  m0 = t0 + LE_READ_UINT64(s->b);
+  t1 += (m0 < t0) + LE_READ_UINT64(s->b+8);
+
+  LE_WRITE_UINT64(s->b, m0);
+  LE_WRITE_UINT64(s->b+8, t1);
+
+  ctx->hh0 = 0;
+  ctx->hh1 = 0;
+  ctx->hh2 = 0;
+}
+#else
+
+#define mul32x32_64(a,b) ((uint64_t)(a) * (b))
+
+#define r0 r.r32[0]
+#define r1 r.r32[1]
+#define r2 r.r32[2]
+#define r3 r.r32[3]
+#define r4 r.r32[4]
+#define s1 r.r32[5]
+#define s2 r.r32[6]
+#define s3 r.r32[7]
+#define s4 r.r32[8]
+
+#define h0 r.r32[10]
+#define h1 r.r32[11]
+#define h2 r.r32[12]
+#define h3 r.r32[13]
+#define h4 r.r32[9]
+
 void
 _nettle_poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
 {
@@ -219,3 +317,4 @@ _nettle_poly1305_digest (struct poly1305_ctx *ctx,
union nettle_block16 *s)
   ctx->h3 = 0;
   ctx->h4 = 0;
 }
+#endif
diff --git a/poly1305.h b/poly1305.h
index 99c63c8a..659fb11f 100644
--- a/poly1305.h
+++ b/poly1305.h
@@ -52,22 +52,16 @@ extern "C" {
 #define POLY1305_BLOCK_SIZE 16

 struct poly1305_ctx {
-  /* Key, 128-bit value and some cached multiples. */
+  /* For base 2^26: key r32[0,...,4], extra multiples r32[5,...,8], state
r32[9,...,13].
+     For base 2^64: key r64[0,1], extra multiples r64[2,3], state
r64[4,5,6].
+     For base 2^44: key r64[0,1,2], extra multiple r64[3], state r64[4,5,6]
+       (would like space for one more multiple).
+  */
   union
   {
-    uint32_t r32[6];
-    uint64_t r64[3];
+    uint32_t r32[14];
+    uint64_t r64[7];
   } r;
-  uint32_t s32[3];
-  /* State, represented as words of 26, 32 or 64 bits, depending on
-     implementation. */
-  /* High bits first, to maintain alignment. */
-  uint32_t hh;
-  union
-  {
-    uint32_t h32[4];
-    uint64_t h64[2];
-  } h;
 };

 /* poly1305-aes */

Regards,
> /Niels
>
> diff --git a/configure.ac b/configure.ac
> index 59f68b00..8d7af818 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -242,6 +242,22 @@ if test "x$nettle_cv_c_builtin_bswap64" = "xyes" ;
> then
>    AC_DEFINE(HAVE_BUILTIN_BSWAP64)
>  fi
>
> +AC_CACHE_CHECK([for __int128],
> +              nettle_cv_c_int128,
> +[AC_TRY_LINK([
> +#include <stdint.h>
> +],[
> +uint64_t x = 1;
> +uint64_t y = 2;
> +unsigned __int128 z = x * (unsigned __int128) y;
> +],
> +nettle_cv_c_int128=yes,
> +nettle_cv_c_int128=no)])
> +
> +AH_TEMPLATE([HAVE_TYPE_INT128], [Define if __int128 type is available])
> +if test "$nettle_cv_c_int128" = "yes" ; then
> +  AC_DEFINE(HAVE_TYPE_INT128)
> +fi
>  LSH_GCC_ATTRIBUTES
>
>  # Check for file locking. We (AC_PROG_CC?) have already checked for
> diff --git a/poly1305-internal.c b/poly1305-internal.c
> index 380b934e..6c9330b7 100644
> --- a/poly1305-internal.c
> +++ b/poly1305-internal.c
> @@ -67,24 +67,6 @@
>
>  #include "macros.h"
>
> -#define mul32x32_64(a,b) ((uint64_t)(a) * (b))
> -
> -#define r0 r.r32[0]
> -#define r1 r.r32[1]
> -#define r2 r.r32[2]
> -#define r3 r.r32[3]
> -#define r4 r.r32[4]
> -#define s1 r.r32[5]
> -#define s2 s32[0]
> -#define s3 s32[1]
> -#define s4 s32[2]
> -
> -#define h0 h.h32[0]
> -#define h1 h.h32[1]
> -#define h2 h.h32[2]
> -#define h3 h.h32[3]
> -#define h4 hh
> -
>  /* For fat builds */
>  #if HAVE_NATIVE_poly1305_set_key
>  void
> @@ -107,6 +89,73 @@ _nettle_poly1305_digest_c(struct poly1305_ctx *ctx,
>  # define _nettle_poly1305_digest _nettle_poly1305_digest_c
>  #endif
>
> +#if 0 && HAVE_NATIVE_64_BIT && HAVE_TYPE_INT128
> +#define mul64x64_128) ((unsigned __int128) (a) * (b))
> +#define r0 r.r64[0]
> +#define r1 r.r64[1]
> +#define r2 r.r64[2]
> +#define s1 r.r64[3]
> +/* FIXME: Need abi update to fit s2 in context struct. */
> +
> +#define h0 r.r64[4]
> +#define h1 r.r64[5]
> +#define h2 r.r64[6]
> +
> +void
> +_nettle_poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
> +{
> +  uint64_t t0,t1,t2,t3;
> +
> +  t0 = LE_READ_UINT64(key);
> +  t1 = LE_READ_UINT64(key+8);
> +
> +  ctx->r0 = 0xffc0fffffff & t0;
> +  ctx->r1 = 0xfffffc0ffff & ((t0 >> 44) | (t1 << 20));
> +  ctx->r2 = 0x00ffffffc0f & (t1 >> 24);
> +  ctx->s1 = 20 * ctx->r1;
> +
> +  ctx->h0 = 0;
> +  ctx->h1 = 0;
> +  ctx->h2 = 0;
> +}
> +
> +void
> +_nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m,
> unsigned t4)
> +{
> +  uint64_t t0, t1, s2, h0, h1, h2;
> +  unsigned __int128 p0, p1, p2;
> +  t0 = LE_READ_UINT64(m);
> +  t1 = LE_READ_UINT64(m+8);
> +
> +  h0 = ctx->h0 + 0xfffffffffff & t0;
> +  h1 = ctx->h1 + 0xfffffffffff & ((t0 >> 44) | (t1 << 20));
> +  g2 = ctx->h2 + (t1 >> 24);
> +
> +  s2 = 20 * ctx->r2;
> +  p0 = mul64x64 (h0, ctx->r0) + mul64x64 (h1, s2) + mul64x64 (h2,
> ctx->s1);
> +  p1 = mul64x64 (h0, ctx->r1) + mul64x64 (h1, ctx->r0) + mul64x64 (h2,
> s2);
> +  p2 = mul64x64 (h0, ctx->r2) + mul64x64 (h1, ctx->r1) + mul64x64 (h2,
> ctx->r1);
> +
> +#else
> +
> +#define mul32x32_64(a,b) ((uint64_t)(a) * (b))
> +
> +#define r0 r.r32[0]
> +#define r1 r.r32[1]
> +#define r2 r.r32[2]
> +#define r3 r.r32[3]
> +#define r4 r.r32[4]
> +#define s1 r.r32[5]
> +#define s2 r.r32[6]
> +#define s3 r.r32[7]
> +#define s4 r.r32[8]
> +
> +#define h0 r.r32[10]
> +#define h1 r.r32[11]
> +#define h2 r.r32[12]
> +#define h3 r.r32[13]
> +#define h4 r.r32[9]
> +
>  void
>  _nettle_poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
>  {
> @@ -219,3 +268,4 @@ _nettle_poly1305_digest (struct poly1305_ctx *ctx,
> union nettle_block16 *s)
>    ctx->h3 = 0;
>    ctx->h4 = 0;
>  }
> +#endif
> diff --git a/poly1305.h b/poly1305.h
> index 99c63c8a..659fb11f 100644
> --- a/poly1305.h
> +++ b/poly1305.h
> @@ -52,22 +52,16 @@ extern "C" {
>  #define POLY1305_BLOCK_SIZE 16
>
>  struct poly1305_ctx {
> -  /* Key, 128-bit value and some cached multiples. */
> +  /* For base 2^26: key r32[0,...,4], extra multiples r32[5,...,8], state
> r32[9,...,13].
> +     For base 2^64: key r64[0,1], extra multiples r64[2,3], state
> r64[4,5,6].
> +     For base 2^44: key r64[0,1,2], extra multiple r64[3], state
> r64[4,5,6]
> +       (would like space for one more multiple).
> +  */
>    union
>    {
> -    uint32_t r32[6];
> -    uint64_t r64[3];
> +    uint32_t r32[14];
> +    uint64_t r64[7];
>    } r;
> -  uint32_t s32[3];
> -  /* State, represented as words of 26, 32 or 64 bits, depending on
> -     implementation. */
> -  /* High bits first, to maintain alignment. */
> -  uint32_t hh;
> -  union
> -  {
> -    uint32_t h32[4];
> -    uint64_t h64[2];
> -  } h;
>  };
>
>  /* poly1305-aes */
>
>
> --
> Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
> Internet email is subject to wholesale government surveillance.
>
_______________________________________________
nettle-bugs mailing list -- nettle-bugs@lists.lysator.liu.se
To unsubscribe send an email to nettle-bugs-le...@lists.lysator.liu.se

Reply via email to