Yes I've tried that too. It was the same exact speed, so I started to
profile the function, which hasn't been easy.
I think the vast majority (98%+) of the function is spent in calls to
mpz_rabinmiller.

If I don't change the small primes for trail division than the same number
of calls get made to mpz_rabinmiller
and speed should be largely unchanged (minus what is a frustrating amount
of noise in tune/speed due to randomness).

I'm working on getting a setup where I can profile tune/speed for
1. number of false calls to mpz_millerrabin
2. overall time spent in mpz_millerrabin

When I use gprof I see 98% of the cum time is in __gmpn_redc_1,
__gmpn_sqr_basecase, and __gmpn_powm
which I'm interpreting as mpz_millerrabin but I'd love to see the call flow
like with gperftool and pprof instead of
the flat profile (which is all I can get gprof to produce)



On Tue, Sep 3, 2019 at 9:44 PM Niels Möller <ni...@lysator.liu.se> wrote:

> Seth Troisi <brain...@gmail.com> writes:
>
> > The idea is to try and avoid performing modulo by storing the distance to
> > the
> > next multiple of each prime. This avoids divisions, unfortunately it
> > requires
> > writing back to the table each time a multiple is passed.
>
> Have you tried divisibility test based on 2-adic inverse? It works like
> this:
>
> Let p be an odd prime, and x any number that fits in a uint16_t (say),
> and you want to know if p divides x.
>
> Precompute
>
>   p_inv = p^-1 (mod 2^16)
>
> and
>
>   p_limit = floor (2^16 / p) = floor ( (2^16 - 1) / p)
>
> Then p divides x if and only if x * p_inv mod 2^16 <= p_limit.
>
> The reason this works is that (i) multiplication by p_inv mod 2^16 is a
> one-to-one mapping, and (ii) whenever p divides x, it agrees with normal
> division, p_inv * x mod 2^16 = x / p.
>
> Therefore all the multiples of p are mapped to the interval 0 <= x *
> p_inv mod 2^16 <= p_limit, and non-multiples are mapped to larger values.
>
> So if you tabulate p_inv and p_limit together with the prime table,
>
>   r = (moduli[i] + incr) % prime;
>   if (r == 0) goto next;
>
> could be replaced with
>
>   if ((uint16_t)((moduli[i] + incr) * p_inv) <= p_limit)
>     goto next;
>
> which a multiply instead of a division. I would expect it to be faster
> than the loop
>
>   while (incr > distance[i])
>     {
>       distance[i] += prime;
>     }
>
> even though it's a bit unclear to me how many times it typically runs
> and how well predictable the branch is.
>
> Regards,
> /Niels
>
> --
> Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
> Internet email is subject to wholesale government surveillance.
>
diff --git a/mpz/nextprime.c b/mpz/nextprime.c
index 4c5ca57..1c335a4 100644
--- a/mpz/nextprime.c
+++ b/mpz/nextprime.c
@@ -33,6 +33,12 @@ see https://www.gnu.org/licenses/.  */
 #include "gmp-impl.h"
 #include "longlong.h"
 
+#include "stdio.h"
+
+/* Maximum number of primes to test with trial division. */
+#define NUMBER_OF_PRIMES 167
+
+/* Gaps between primes (first gap is 5-3=2, last is 1009-997=12) */
 static const unsigned char primegap[] =
 {
   2,2,4,2,4,2,4,6,2,6,4,2,4,6,6,2,6,4,2,6,4,6,8,4,2,4,2,4,14,4,6,
@@ -43,7 +49,43 @@ static const unsigned char primegap[] =
   6,14,4,6,6,8,6,12
 };
 
-#define NUMBER_OF_PRIMES 167
+/* primes[] inverse (mod 2^16), excluding 2 */
+static const unsigned short primeinv[] =
+{
+  43691, 52429, 28087, 35747, 20165, 61681, 51739, 14247, 49717, 31711,
+  7085, 39961, 48771, 18127, 21021, 55539, 38677, 19563, 43383, 61945,
+  5807, 17371, 18409, 41889, 45421, 6999, 44099, 2405, 49297, 49023,
+  20011, 30137, 26403, 51901, 32551, 34229, 45835, 42775, 25381, 44667,
+  60829, 28479, 36673, 32269, 49399, 22363, 47903, 17611, 48365, 55129,
+  11791, 61457, 2611, 65281, 40119, 46533, 52719, 17981, 52009, 28947,
+  12973, 57851, 33927, 40201, 56853, 32867, 59313, 18131, 23285, 25249,
+  35415, 32143, 1757, 46515, 48767, 53069, 10565, 57201, 49833, 14859,
+  65069, 62799, 42833, 44039, 39795, 3649, 59513, 54021, 25903, 65115,
+  64031, 33239, 7875, 32571, 4039, 58197, 11321, 63907, 53301, 48523,
+  40357, 51451, 19465, 34547, 3521, 14179, 34481, 2407, 9705, 55711,
+  57197, 44505, 39491, 30535, 15745, 56363, 9015, 36933, 27547, 47293,
+  24929, 5421, 59395, 31867, 34965, 11277, 50223, 4327, 58741, 21195,
+  61655, 27663, 6493, 8009, 64769, 20941, 16155, 55093, 18713, 41859,
+  30493, 8839, 56819, 27669, 46711, 57853, 5353, 29907, 6303, 32357,
+  23953, 55739, 50759, 3107, 47983, 44071, 41057, 20633, 22565, 25467,
+  4745, 52727, 35299, 13617, 40935, 30751, 33261, 36113
+};
+
+/* 2^16 - 1 / primes[], excluding 2 */
+static const unsigned short primeb[] =
+{
+  21845, 13107, 9362, 5957, 5041, 3855, 3449, 2849, 2259, 2114, 1771,
+  1598, 1524, 1394, 1236, 1110, 1074, 978, 923, 897, 829, 789, 736, 675, 648,
+  636, 612, 601, 579, 516, 500, 478, 471, 439, 434, 417, 402, 392, 378, 366,
+  362, 343, 339, 332, 329, 310, 293, 288, 286, 281, 274, 271, 261, 255, 249,
+  243, 241, 236, 233, 231, 223, 213, 210, 209, 206, 197, 194, 188, 187, 185,
+  182, 178, 175, 172, 171, 168, 165, 163, 160, 156, 155, 152, 151, 149, 147,
+  145, 143, 142, 141, 140, 136, 134, 133, 131, 130, 128, 125, 125, 121, 119,
+  117, 116, 115, 114, 113, 111, 110, 109, 109, 107, 106, 106, 105, 103, 102,
+  101, 101, 100, 99, 99, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 88, 87, 86,
+  86, 85, 84, 83, 82, 81, 80, 79, 79, 79, 79, 78, 76, 76, 76, 75, 74, 74, 74,
+  73, 72, 71, 71, 70, 69, 69, 69, 68, 67, 67, 67, 66, 66, 65, 64
+};
 
 void
 mpz_nextprime (mpz_ptr p, mpz_srcptr n)
@@ -52,10 +94,10 @@ mpz_nextprime (mpz_ptr p, mpz_srcptr n)
   unsigned long difference;
   int i;
   unsigned prime_limit;
-  unsigned long prime;
+  unsigned short prime;
   mp_size_t pn;
   mp_bitcnt_t nbits;
-  unsigned incr;
+  unsigned short incr;
   TMP_SDECL;
 
   /* First handle tiny numbers */
@@ -64,16 +106,19 @@ mpz_nextprime (mpz_ptr p, mpz_srcptr n)
       mpz_set_ui (p, 2);
       return;
     }
+
+  /* Set p to next odd number */
   mpz_add_ui (p, n, 1);
   mpz_setbit (p, 0);
 
+
   if (mpz_cmp_ui (p, 7) <= 0)
     return;
 
   pn = SIZ(p);
   MPN_SIZEINBASE_2EXP(nbits, PTR(p), pn, 1);
   if (nbits / 2 >= NUMBER_OF_PRIMES)
-    prime_limit = NUMBER_OF_PRIMES - 1;
+    prime_limit = NUMBER_OF_PRIMES;
   else
     prime_limit = nbits / 2;
 
@@ -92,7 +137,7 @@ mpz_nextprime (mpz_ptr p, mpz_srcptr n)
 	  prime += primegap[i];
 	}
 
-#define INCR_LIMIT 0x10000	/* deep science */
+#define INCR_LIMIT 0x04000	/* deep science */
 
       for (difference = incr = 0; incr < INCR_LIMIT; difference += 2)
 	{
@@ -100,15 +145,29 @@ mpz_nextprime (mpz_ptr p, mpz_srcptr n)
 	  prime = 3;
 	  for (i = 0; i < prime_limit; i++)
 	    {
+/*
 	      unsigned r;
-	      /* FIXME: Reduce moduli + incr and store back, to allow for
-		 division-free reductions.  Alternatively, table primes[]'s
-		 inverses (mod 2^16).  */
 	      r = (moduli[i] + incr) % prime;
-	      prime += primegap[i];
-
-	      if (r == 0)
+*/
+
+              unsigned short t = moduli[i] + incr;
+              unsigned short inv = primeinv[i];
+              unsigned short temp = t * inv;
+              unsigned short b = primeb[i];
+
+              unsigned isdiv = temp <= b;
+/*
+              printf("(%d + %d) %% %ld == %d ?= %d == %d <= %d\n",
+                moduli[i], incr, prime,
+                r, isdiv,
+                temp, b);
+              ASSERT ( (r == 0) == isdiv );
+*/
+	      if (isdiv) {
+	      //if (r == 0) {
 		goto next;
+              }
+	      prime += primegap[i];
 	    }
 
 	  mpz_add_ui (p, p, difference);
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to