On Sat, Oct 15, 2022 at 9:37 AM Niels Möller <ni...@lysator.liu.se> wrote:
> Maamoun TK <maamoun...@googlemail.com> writes: > > > I updated https://git.lysator.liu.se/nettle/nettle/-/merge_requests/48 > to > > have the logic of processing partial blocks in C files. > > I was thinking of something like > > const uint8_t * > _nettle_poly1305_update(struct poly1305_ctx*, > size_t blocks, const uint8_t *data); > > as the interface to assembly implementations. May need some additional > argument to be able to handle last partial block (I think it's fine to > have the main case handle only full blocks, i.e., the t4 == 1 case if I > remember correctly), but we need *some* way to also deal with t4 == 0 > for the final block, that could be an additional t4 argument applied > only to the last block processed, or possibly a separate entry point. Or > even some other hack, like a special value for blocks (0 or ~0) meaning > thet there's only a single block to process, and that t4 = 0, not sure > what's the best design. > I think the design could be as simple as always padding each block with 0x01 in _nettle_poly1305_update while keeping _nettle_poly1305_block that is responsible for processing last block takes variable padding values (0 or 1). I committed an update in https://git.lysator.liu.se/nettle/nettle/-/merge_requests/48 that applies that design. > Let me give it a shot on my own during weekend. > > Great, please do. I had a quick look yesterday, let me share my start. > The parts that are potentially useful for you are the configure test and > related logic for __int128 (the only part of the patch actually tested), > and the reorganization of fields in poly1305_ctx (without changing size > or alignment, at least that's the intention). See below. > I've implemented Poly1305 based on 2^44 in C and got the following benchmark results of update function tested on Core i5-10300H *-------------------------------------------------------------------* | Radix | Mbyte/s | |----------------------------------------------|---------------------| | 2^26 (C) | 1397.53 | | 2^44 (C) | 1734.09 | | 2^64 (C) | 2645.74 | | 2^64 (C deferred partial folding) | 3048.93 | | 2^64 (Assembly - latest) | 4001.14 | *-------------------------------------------------------------------* I used two versions of 2^64 based C-implementations you shared earlier for this benchmark. As it stands, Poly1305 based on 2^64 with deferred partial folding performs better compared with other C implementations for single block processing. I will attach a working implementation of radix 2^44 below. regards, Mamone diff --git a/configure.ac b/configure.ac index 59f68b00..55186127 100644 --- a/configure.ac +++ b/configure.ac @@ -242,6 +242,22 @@ if test "x$nettle_cv_c_builtin_bswap64" = "xyes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP64) fi +AC_CACHE_CHECK([for __int128], + nettle_cv_c_int128, +[AC_TRY_LINK([ +#include <stdint.h> +],[ +uint64_t x = 1; +uint64_t y = 2; +unsigned __int128 z = x * (unsigned __int128) y; +], +nettle_cv_c_int128=yes, +nettle_cv_c_int128=no)]) + +AH_TEMPLATE([HAVE_TYPE_INT128], [Define if __int128 type is available]) +if test "$nettle_cv_c_int128" = "yes" ; then + AC_DEFINE(HAVE_TYPE_INT128) +fi LSH_GCC_ATTRIBUTES # Check for file locking. We (AC_PROG_CC?) have already checked for diff --git a/poly1305-internal.c b/poly1305-internal.c index 380b934e..1b4bf46e 100644 --- a/poly1305-internal.c +++ b/poly1305-internal.c @@ -67,24 +67,6 @@ #include "macros.h" -#define mul32x32_64(a,b) ((uint64_t)(a) * (b)) - -#define r0 r.r32[0] -#define r1 r.r32[1] -#define r2 r.r32[2] -#define r3 r.r32[3] -#define r4 r.r32[4] -#define s1 r.r32[5] -#define s2 s32[0] -#define s3 s32[1] -#define s4 s32[2] - -#define h0 h.h32[0] -#define h1 h.h32[1] -#define h2 h.h32[2] -#define h3 h.h32[3] -#define h4 hh - /* For fat builds */ #if HAVE_NATIVE_poly1305_set_key void @@ -107,6 +89,122 @@ _nettle_poly1305_digest_c(struct poly1305_ctx *ctx, # define _nettle_poly1305_digest _nettle_poly1305_digest_c #endif +#if 1 && HAVE_NATIVE_64_BIT && HAVE_TYPE_INT128 +#define mul64x64_128(a,b) ((unsigned __int128) (a) * (b)) +#define r0 r.r64[0] +#define r1 r.r64[1] +#define r2 r.r64[2] +#define s1 r.r64[3] +#define hh0 r.r64[4] +#define hh1 r.r64[5] +#define hh2 r.r64[6] + +void +_nettle_poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16]) +{ + uint64_t t0,t1; + + t0 = LE_READ_UINT64(key); + t1 = LE_READ_UINT64(key+8); + + ctx->r0 = 0xffc0fffffff & t0; + ctx->r1 = 0xfffffc0ffff & ((t0 >> 44) | (t1 << 20)); + ctx->r2 = 0x00ffffffc0f & (t1 >> 24); + ctx->s1 = 20 * ctx->r1; + + ctx->hh0 = 0; + ctx->hh1 = 0; + ctx->hh2 = 0; +} + +void +_nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m, unsigned t4) +{ + uint64_t t0, t1, s2, h0, h1, h2; + unsigned __int128 p0, p1, p2; + t0 = LE_READ_UINT64(m); + t1 = LE_READ_UINT64(m+8); + + h0 = ctx->hh0 + (0xfffffffffff & t0); + h1 = ctx->hh1 + (0xfffffffffff & ((t0 >> 44) | (t1 << 20))); + h2 = ctx->hh2 + ((t1 >> 24) | ((uint64_t) t4 << 40)); + + s2 = 20 * ctx->r2; + p0 = mul64x64_128 (h0, ctx->r0) + mul64x64_128 (h1, s2) + mul64x64_128 (h2, ctx->s1); + p1 = mul64x64_128 (h0, ctx->r1) + mul64x64_128 (h1, ctx->r0) + mul64x64_128 (h2, s2); + p2 = mul64x64_128 (h0, ctx->r2) + mul64x64_128 (h1, ctx->r1) + mul64x64_128 (h2, ctx->r0); + + p1 += p0 >> 44; + p2 += p1 >> 44; + ctx->hh2 = 0x3ffffffffff & p2; + p2 >>= 42; + p0 = (p0 & 0xfffffffffff) + (p2 << 2) + p2; + ctx->hh0 = 0xfffffffffff & p0; + ctx->hh1 = (0xfffffffffff & p1) + (p0 >> 44); +} + +/* Adds digest to the nonce */ +void +_nettle_poly1305_digest (struct poly1305_ctx *ctx, union nettle_block16 *s) +{ + uint64_t t0, t1, t2, c1, mask, m0; + t0 = ctx->hh0; + t1 = ctx->hh1; + t2 = ctx->hh2; + + /* Moving carry */ + t2 += t1 >> 44; + t0 += (t2 >> 42) * 5; + t1 = (t1 & 0xfffffffffff) + (t0 >> 44); + t0 &= 0xfffffffffff; + t2 &= 0x3ffffffffff; + + /* Convert state of radix 2^44 to 2^64 */ + t0 |= t1 << 44; + t1 = (t1 >> 20) | (t2 << 24); + t2 >>= 40; + + /* Compute resulting carries when adding 5. */ + c1 = t0 >= -(UINT64_C(5)); + t2 += (t1 + c1 < c1); + + /* Set if H >= 2^130 - 5 */ + mask = - (t2 >> 2); + + t0 += mask & 5; + t1 += mask & c1; + + /* FIXME: Take advantage of s being aligned as an unsigned long. */ + m0 = t0 + LE_READ_UINT64(s->b); + t1 += (m0 < t0) + LE_READ_UINT64(s->b+8); + + LE_WRITE_UINT64(s->b, m0); + LE_WRITE_UINT64(s->b+8, t1); + + ctx->hh0 = 0; + ctx->hh1 = 0; + ctx->hh2 = 0; +} +#else + +#define mul32x32_64(a,b) ((uint64_t)(a) * (b)) + +#define r0 r.r32[0] +#define r1 r.r32[1] +#define r2 r.r32[2] +#define r3 r.r32[3] +#define r4 r.r32[4] +#define s1 r.r32[5] +#define s2 r.r32[6] +#define s3 r.r32[7] +#define s4 r.r32[8] + +#define h0 r.r32[10] +#define h1 r.r32[11] +#define h2 r.r32[12] +#define h3 r.r32[13] +#define h4 r.r32[9] + void _nettle_poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16]) { @@ -219,3 +317,4 @@ _nettle_poly1305_digest (struct poly1305_ctx *ctx, union nettle_block16 *s) ctx->h3 = 0; ctx->h4 = 0; } +#endif diff --git a/poly1305.h b/poly1305.h index 99c63c8a..659fb11f 100644 --- a/poly1305.h +++ b/poly1305.h @@ -52,22 +52,16 @@ extern "C" { #define POLY1305_BLOCK_SIZE 16 struct poly1305_ctx { - /* Key, 128-bit value and some cached multiples. */ + /* For base 2^26: key r32[0,...,4], extra multiples r32[5,...,8], state r32[9,...,13]. + For base 2^64: key r64[0,1], extra multiples r64[2,3], state r64[4,5,6]. + For base 2^44: key r64[0,1,2], extra multiple r64[3], state r64[4,5,6] + (would like space for onn more multiple). + */ union { - uint32_t r32[6]; - uint64_t r64[3]; + uint32_t r32[14]; + uint64_t r64[7]; } r; - uint32_t s32[3]; - /* State, represented as words of 26, 32 or 64 bits, depending on - implementation. */ - /* High bits first, to maintain alignment. */ - uint32_t hh; - union - { - uint32_t h32[4]; - uint64_t h64[2]; - } h; }; /* poly1305-aes */ Regards, > /Niels > > diff --git a/configure.ac b/configure.ac > index 59f68b00..8d7af818 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -242,6 +242,22 @@ if test "x$nettle_cv_c_builtin_bswap64" = "xyes" ; > then > AC_DEFINE(HAVE_BUILTIN_BSWAP64) > fi > > +AC_CACHE_CHECK([for __int128], > + nettle_cv_c_int128, > +[AC_TRY_LINK([ > +#include <stdint.h> > +],[ > +uint64_t x = 1; > +uint64_t y = 2; > +unsigned __int128 z = x * (unsigned __int128) y; > +], > +nettle_cv_c_int128=yes, > +nettle_cv_c_int128=no)]) > + > +AH_TEMPLATE([HAVE_TYPE_INT128], [Define if __int128 type is available]) > +if test "$nettle_cv_c_int128" = "yes" ; then > + AC_DEFINE(HAVE_TYPE_INT128) > +fi > LSH_GCC_ATTRIBUTES > > # Check for file locking. We (AC_PROG_CC?) have already checked for > diff --git a/poly1305-internal.c b/poly1305-internal.c > index 380b934e..6c9330b7 100644 > --- a/poly1305-internal.c > +++ b/poly1305-internal.c > @@ -67,24 +67,6 @@ > > #include "macros.h" > > -#define mul32x32_64(a,b) ((uint64_t)(a) * (b)) > - > -#define r0 r.r32[0] > -#define r1 r.r32[1] > -#define r2 r.r32[2] > -#define r3 r.r32[3] > -#define r4 r.r32[4] > -#define s1 r.r32[5] > -#define s2 s32[0] > -#define s3 s32[1] > -#define s4 s32[2] > - > -#define h0 h.h32[0] > -#define h1 h.h32[1] > -#define h2 h.h32[2] > -#define h3 h.h32[3] > -#define h4 hh > - > /* For fat builds */ > #if HAVE_NATIVE_poly1305_set_key > void > @@ -107,6 +89,73 @@ _nettle_poly1305_digest_c(struct poly1305_ctx *ctx, > # define _nettle_poly1305_digest _nettle_poly1305_digest_c > #endif > > +#if 0 && HAVE_NATIVE_64_BIT && HAVE_TYPE_INT128 > +#define mul64x64_128) ((unsigned __int128) (a) * (b)) > +#define r0 r.r64[0] > +#define r1 r.r64[1] > +#define r2 r.r64[2] > +#define s1 r.r64[3] > +/* FIXME: Need abi update to fit s2 in context struct. */ > + > +#define h0 r.r64[4] > +#define h1 r.r64[5] > +#define h2 r.r64[6] > + > +void > +_nettle_poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16]) > +{ > + uint64_t t0,t1,t2,t3; > + > + t0 = LE_READ_UINT64(key); > + t1 = LE_READ_UINT64(key+8); > + > + ctx->r0 = 0xffc0fffffff & t0; > + ctx->r1 = 0xfffffc0ffff & ((t0 >> 44) | (t1 << 20)); > + ctx->r2 = 0x00ffffffc0f & (t1 >> 24); > + ctx->s1 = 20 * ctx->r1; > + > + ctx->h0 = 0; > + ctx->h1 = 0; > + ctx->h2 = 0; > +} > + > +void > +_nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m, > unsigned t4) > +{ > + uint64_t t0, t1, s2, h0, h1, h2; > + unsigned __int128 p0, p1, p2; > + t0 = LE_READ_UINT64(m); > + t1 = LE_READ_UINT64(m+8); > + > + h0 = ctx->h0 + 0xfffffffffff & t0; > + h1 = ctx->h1 + 0xfffffffffff & ((t0 >> 44) | (t1 << 20)); > + g2 = ctx->h2 + (t1 >> 24); > + > + s2 = 20 * ctx->r2; > + p0 = mul64x64 (h0, ctx->r0) + mul64x64 (h1, s2) + mul64x64 (h2, > ctx->s1); > + p1 = mul64x64 (h0, ctx->r1) + mul64x64 (h1, ctx->r0) + mul64x64 (h2, > s2); > + p2 = mul64x64 (h0, ctx->r2) + mul64x64 (h1, ctx->r1) + mul64x64 (h2, > ctx->r1); > + > +#else > + > +#define mul32x32_64(a,b) ((uint64_t)(a) * (b)) > + > +#define r0 r.r32[0] > +#define r1 r.r32[1] > +#define r2 r.r32[2] > +#define r3 r.r32[3] > +#define r4 r.r32[4] > +#define s1 r.r32[5] > +#define s2 r.r32[6] > +#define s3 r.r32[7] > +#define s4 r.r32[8] > + > +#define h0 r.r32[10] > +#define h1 r.r32[11] > +#define h2 r.r32[12] > +#define h3 r.r32[13] > +#define h4 r.r32[9] > + > void > _nettle_poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16]) > { > @@ -219,3 +268,4 @@ _nettle_poly1305_digest (struct poly1305_ctx *ctx, > union nettle_block16 *s) > ctx->h3 = 0; > ctx->h4 = 0; > } > +#endif > diff --git a/poly1305.h b/poly1305.h > index 99c63c8a..659fb11f 100644 > --- a/poly1305.h > +++ b/poly1305.h > @@ -52,22 +52,16 @@ extern "C" { > #define POLY1305_BLOCK_SIZE 16 > > struct poly1305_ctx { > - /* Key, 128-bit value and some cached multiples. */ > + /* For base 2^26: key r32[0,...,4], extra multiples r32[5,...,8], state > r32[9,...,13]. > + For base 2^64: key r64[0,1], extra multiples r64[2,3], state > r64[4,5,6]. > + For base 2^44: key r64[0,1,2], extra multiple r64[3], state > r64[4,5,6] > + (would like space for onn more multiple). > + */ > union > { > - uint32_t r32[6]; > - uint64_t r64[3]; > + uint32_t r32[14]; > + uint64_t r64[7]; > } r; > - uint32_t s32[3]; > - /* State, represented as words of 26, 32 or 64 bits, depending on > - implementation. */ > - /* High bits first, to maintain alignment. */ > - uint32_t hh; > - union > - { > - uint32_t h32[4]; > - uint64_t h64[2]; > - } h; > }; > > /* poly1305-aes */ > > > -- > Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677. > Internet email is subject to wholesale government surveillance. > _______________________________________________ nettle-bugs mailing list -- nettle-bugs@lists.lysator.liu.se To unsubscribe send an email to nettle-bugs-le...@lists.lysator.liu.se