Hello,

Thank you for the suggestions and sorry for the shameless delay.

ni...@lysator.liu.se (Niels Möller) writes:

>> Also, optimized implementation of modular reduction is currently
>> missing, which is beyond my expertise.  I would appreciate any
>> suggestions regarding that.
>
> If we do Euclidean reduction, we should use the property that
>
>   2^448 = 2^224 + 1 (mod)
>
> And we'd need to use this twice to reduce a 896-bit product to 448 bits.
> On 64-bit machines, we'll get some shifting since 224 isn't a multiple
> of 64.

Due to my ignorance, I probably don't get what you mean, but as far as I
read the implementations of other curves in Nettle, some of them seem to
use the property of generalized Mersenne numbers described in:
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.46.2133
and it's also applicable to the Curve448 prime.

So a 896-bit product here:

  a0 + a1 * 2^k + a2 * 2^2k + a3 * 2^3k (k = 224)

can be reduced to:

  b0 + b1 * 2^k (mod p)

through modular additions of the smaller numbers:

  b0 = a0 + a2 + a3 (mod p)
  b1 = a1 + a2 + a3 + a3 (mod p)

I tried to implement it (as attached) and got 20-30% speed-up with
mini-gmp.

size   modp reduce   modq modinv mi_gcd mi_pow dup_jj ad_jja ad_hhh  mul_g  
mul_a (us)
 192 0.0066 0.0061 0.0770  28.90  0.000   0.00  0.698  0.811  1.014   44.2  
188.5
 224 0.0091 0.0091 0.1231  47.57  0.000   0.00  0.969  1.285  1.566   90.2  
329.1
 255 0.0081 0.0078 0.1550  18.70  0.000   0.00  0.767  1.014  1.082   88.7  
278.4
 256 0.1288 0.0115 0.1464  50.22  0.000   0.00  1.004  1.457  1.701  105.6  
380.1
 384 0.0170 0.0170 0.1260 106.51  0.000   0.00  1.791  2.240  3.113  305.9 
1015.7
 448 0.1892 0.1886 0.1886 171.57  0.000   0.00  3.630  4.914  4.940  842.1 
1966.8
 521 0.0163 0.0158 0.2821 217.44  0.000   0.00  3.555  4.507  6.107  788.8 
2637.8

size   modp reduce   modq modinv mi_gcd mi_pow dup_jj ad_jja ad_hhh  mul_g  
mul_a (us)
 192 0.0080 0.0082 0.0748  30.61  0.000   0.00  0.663  0.810  1.007   43.9  
189.9
 224 0.0103 0.0108 0.1281  47.48  0.000   0.00  0.968  1.481  1.632   89.4  
323.2
 255 0.0095 0.0090 0.1545  18.71  0.000   0.00  0.828  1.081  1.085   90.8  
284.9
 256 0.1353 0.0123 0.1423  51.25  0.000   0.00  0.992  1.326  1.726  107.4  
379.3
 384 0.0174 0.0171 0.1253 108.82  0.000   0.00  1.760  2.233  2.991  302.9 
1004.2
 448 0.1382 0.1380 0.1865 147.87  0.000   0.00  3.167  4.161  4.211  738.6 
1668.3
 521 0.0186 0.0179 0.2750 216.30  0.000   0.00  3.438  4.249  5.756  743.5 
2510.6

Do you think this is acceptable or need further optimization?

Regards,
-- 
Daiki Ueno
>From bc210d94144d6b5d1e050f78b7261d58a58d8f19 Mon Sep 17 00:00:00 2001
From: Daiki Ueno <du...@redhat.com>
Date: Tue, 9 Jan 2018 16:13:53 +0100
Subject: [PATCH] ecc: Add optimized modp implementation for curve448

Signed-off-by: Daiki Ueno <du...@redhat.com>
---
 ecc-448.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 70 insertions(+), 2 deletions(-)

diff --git a/ecc-448.c b/ecc-448.c
index f98616d8..d62e8e1b 100644
--- a/ecc-448.c
+++ b/ecc-448.c
@@ -45,6 +45,74 @@
 
 #include "ecc-448.h"
 
+#if HAVE_NATIVE_ecc_448_modp
+
+#define ecc_448_modp nettle_ecc_448_modp
+void
+ecc_448_modp (const struct ecc_modulo *m, mp_limb_t *rp);
+
+#elif GMP_NUMB_BITS == 64
+/*
+ * Use 2^448 = 2^224 + 1 (mod p), compute:
+ *   b0 + b1 * 2^224 = a0 + a1 * 2^224 + a2 * 2^448 + a3 * 2^672 (mod p)
+ *
+ * as p is considered as a generalized Mersenne number, b0 and b1 can
+ * be calculated with the following shortcut:
+ *
+ *   b0 = a0 + a2 + a3 (mod p)
+ *   b1 = a1 + a2 + a3 + a3 (mod p)
+ */
+static void
+ecc_448_modp (const struct ecc_modulo *m, mp_limb_t *rp)
+{
+  mp_limb_t a1[4], a3[4], cy, mask;
+
+  /* Extract a1 */
+  mpn_copyi (a1, rp + 3, 4);
+  mpn_rshift (a1, a1, 4, 32);
+  rp[3] &= 0xFFFFFFFF;
+
+  /* Extract a3 */
+  mpn_copyi (a3, rp + 10, 4);
+  mpn_rshift (a3, a3, 4, 32);
+  rp[10] &= 0xFFFFFFFF;
+
+  /* a2 = a2 + a3 */
+  mpn_add_n (rp + 7, rp + 7, a3, 4);
+
+  /* a0 = a0 + a2 (carry propagated to a2) */
+  mpn_add_n (rp, rp, rp + 7, 4);
+  mpn_rshift (&cy, rp + 3, 1, 32);
+  sec_add_1 (rp + 7, rp + 7, 4, cy);
+
+  /* a2 = a2 + a1 */
+  mpn_add_n (rp + 7, rp + 7, a1, 4);
+
+  /* a2 = a2 + a3 */
+  mpn_add_n (rp + 7, rp + 7, a3, 4);
+  mpn_rshift (&cy, rp + 10, 1, 32);
+
+  /* Move a2 next to a0 */
+  rp[6] = 0;
+  mpn_lshift (rp + 6, rp + 7, 4, 32);
+  rp[3] &= 0xFFFFFFFF;
+  rp[3] |= rp[6];
+  mpn_copyi (rp + 4, rp + 7, 3);
+
+  assert (cy <= 3);
+  mask = (1 << cy) - 1;
+  cy = cnd_add_n (mask & 1, rp, ecc_Bmodp, 7);
+  assert (cy == 0);
+  cy = cnd_add_n (mask & 2, rp, ecc_Bmodp, 7);
+  assert (cy == 0);
+  cy = cnd_add_n (mask & 4, rp, ecc_Bmodp, 7);
+  assert (cy == 0);
+}
+
+#else
+#define ecc_448_modp ecc_mod
+#endif
+
 /* Needs 2*ecc->size limbs at rp, and 2*ecc->size additional limbs of
    scratch space. No overlap allowed. */
 static void
@@ -221,8 +289,8 @@ const struct ecc_curve _nettle_curve448 =
     NULL,
     ecc_pp1h,
 
-    ecc_mod,	      /* FIXME: Implement optimized mod function */
-    ecc_mod,	      /* FIXME: Implement optimized reduce function */
+    ecc_448_modp,
+    ecc_448_modp,
     ecc_448_inv,
     ecc_448_sqrt,
   },
-- 
2.13.6

_______________________________________________
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs

Reply via email to