Maamoun TK <[email protected]> writes:

>> What's the speedup you get from assembly gcm_fill? I see the C
>> implementation uses memcpy and WRITE_UINT32, and is likely significantly
>> slower than the ctr_fill16 in ctr.c. But it could be improved using
>> portable means. If done well, it should be a very small fraction of the
>> cpu time spent for gcm encryption.

> I measured the execution time of both the C and altivec implementations
> on POWER8 for 32,768 blocks (512 KB), repeated 10000 times and compiled
> with -O3: gcm_fill_c() took 0.000073 seconds to execute, and
> gcm_fill_altivec() took 0.000019 seconds. As you can see, the function
> itself isn't time consuming at all, and maybe optimizing it is not
> worth it,
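
(For reference, a minimal harness for that kind of comparison might
look like the sketch below; the clock_gettime-based timing and the
idea of taking the two fill variants as nettle_fill16_func pointers
are assumptions on my part, not necessarily the setup actually used.)

  #include <stdio.h>
  #include <time.h>
  #include <nettle/gcm.h>

  #define BLOCKS 32768
  #define REPS 10000

  static union nettle_block16 buffer[BLOCKS];

  /* Returns average seconds per call over REPS repetitions,
     e.g. time_fill(gcm_fill_c) vs. time_fill(gcm_fill_altivec). */
  static double
  time_fill (nettle_fill16_func *fill)
  {
    uint8_t ctr[GCM_BLOCK_SIZE] = {0};
    struct timespec start, end;
    int i;

    clock_gettime (CLOCK_MONOTONIC, &start);
    for (i = 0; i < REPS; i++)
      fill (ctr, BLOCKS, buffer);
    clock_gettime (CLOCK_MONOTONIC, &end);

    return ((end.tv_sec - start.tv_sec)
            + 1e-9 * (end.tv_nsec - start.tv_nsec)) / REPS;
  }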

Can you try the patch below? For now, tested on little-endian (x86_64)
only, and there the loop compiles to

  50:   89 c8                   mov    %ecx,%eax
  52:   4c 89 0a                mov    %r9,(%rdx)
  55:   48 83 c2 10             add    $0x10,%rdx
  59:   83 c1 01                add    $0x1,%ecx
  5c:   0f c8                   bswap  %eax
  5e:   48 c1 e0 20             shl    $0x20,%rax
  62:   4c 01 d0                add    %r10,%rax
  65:   48 89 42 f8             mov    %rax,-0x8(%rdx)
  69:   4c 39 c2                cmp    %r8,%rdx
  6c:   75 e2                   jne    50 <gcm_fill+0x20>

Should run at a few cycles per block (around 6, assuming dual-issue and
decent out-of-order capability). I would expect unrolling, to do
multiple blocks in parallel, to give a large performance improvement
only on strict in-order processors.
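
For illustration, a two-way unrolled variant of the patch's
little-endian loop might look like the sketch below (untested, not
part of the patch; it just makes the two stored blocks per iteration
independent of each other):

  for (i = 0; i + 2 <= blocks; i += 2)
    {
      buffer[i].u64[0] = hi;
      buffer[i].u64[1] = mid + ((uint64_t)__builtin_bswap32(lo) << 32);
      buffer[i+1].u64[0] = hi;
      buffer[i+1].u64[1] = mid + ((uint64_t)__builtin_bswap32(lo + 1) << 32);
      lo += 2;
    }
  if (i < blocks)
    {
      /* Trailing block when blocks is odd. */
      buffer[i].u64[0] = hi;
      buffer[i].u64[1] = mid + ((uint64_t)__builtin_bswap32(lo) << 32);
      lo++;
    }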

> but gcm_fill is part of AES_CTR and what other
> libraries usually do is optimizing AES_CTR as a whole so I considered
> optimizing it to stay on the same track.

In Nettle, I strive to take on the extra complexity of an assembler
implementation only when there's a significant performance benefit.

Regards,
/Niels

diff --git a/gcm.c b/gcm.c
index cf615daf..71e9f365 100644
--- a/gcm.c
+++ b/gcm.c
@@ -334,6 +334,46 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key,
 }
 
 static nettle_fill16_func gcm_fill;
+#if WORDS_BIGENDIAN
+static void
+gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
+{
+  uint64_t hi, mid;
+  uint32_t lo;
+  size_t i;
+  hi = READ_UINT64(ctr);
+  mid = (uint64_t)READ_UINT32(ctr + 8) << 32;
+  lo = READ_UINT32(ctr + 12);
+
+  for (i = 0; i < blocks; i++)
+    {
+      buffer[i].u64[0] = hi;
+      buffer[i].u64[1] = mid + lo++;
+    }
+  WRITE_UINT32(ctr + 12, lo);
+
+}
+#elif HAVE_BUILTIN_BSWAP64
+/* Assume __builtin_bswap32 is also available */
+static void
+gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
+{
+  uint64_t hi, mid;
+  uint32_t lo;
+  size_t i;
+  hi = LE_READ_UINT64(ctr);
+  mid = LE_READ_UINT32(ctr + 8);
+  lo = READ_UINT32(ctr + 12);
+
+  for (i = 0; i < blocks; i++)
+    {
+      buffer[i].u64[0] = hi;
+      buffer[i].u64[1] = mid + ((uint64_t)__builtin_bswap32(lo) << 32);
+      lo++;
+    }
+  WRITE_UINT32(ctr + 12, lo);
+}
+#else
 static void
 gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
 {
@@ -349,6 +389,7 @@ gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
 
   WRITE_UINT32(ctr + GCM_BLOCK_SIZE - 4, c);
 }
+#endif
 
 void
 gcm_encrypt (struct gcm_ctx *ctx, const struct gcm_key *key,
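
As a sanity check of the HAVE_BUILTIN_BSWAP64 construction: on a
little-endian host, buffer[i].u64[1] must end up with the same
in-memory bytes that WRITE_UINT32 stores at ctr + 12, i.e. the
big-endian encoding of the counter in the last four bytes. A
standalone sketch of that check (my own test, assuming gcc or clang
for __builtin_bswap32):

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  int
  main (void)
  {
    uint32_t lo = 0x01020304;
    /* Take mid (block bytes 8..11) as zero for the check. */
    uint64_t word = (uint64_t) __builtin_bswap32 (lo) << 32;
    /* The big-endian encoding of lo must land in the last four
       bytes, matching WRITE_UINT32 (ctr + 12, lo). */
    static const uint8_t expect[8] = { 0, 0, 0, 0, 1, 2, 3, 4 };
    uint8_t bytes[8];
    memcpy (bytes, &word, sizeof (word));
    assert (memcmp (bytes, expect, sizeof (expect)) == 0);
    printf ("ok\n");
    return 0;
  }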


-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.