Below is the next patch, which updates the benchmarking of the toom mul
functions to take a size_ratio parameter and uses it when tuning thresholds
like MUL_TOOM32_TO_TOOM43_THRESHOLD. It seems to work fine for me, but it
would be nice if someone else could run it and check that the resulting
thresholds look sane.

One thing that looks odd is

   /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */

for the tuning of MUL_TOOM43_TO_TOOM54_THRESHOLD. The ideal ratio for toom43
is 3/4 = 0.75, and the ideal ratio for toom54 is 4/5 = 0.8, so 5/6 = 0.833...
is higher than both. I would have expected the midpoint of 3/4 and 4/5,
which would be 31/40 = 0.775.

Regards,
/Niels

diff -r 869c08440fbd tune/common.c
--- a/tune/common.c     Wed Nov 22 14:53:47 2023 +0100
+++ b/tune/common.c     Tue Dec 19 19:52:01 2023 +0100
@@ -1386,50 +1386,20 @@
   SPEED_ROUTINE_MPN_TOOM43_MUL (mpn_toom43_mul);
 }
 double
+speed_mpn_toom53_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM53_MUL (mpn_toom53_mul);
+}
+double
+speed_mpn_toom54_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM54_MUL (mpn_toom54_mul);
+}
+double
 speed_mpn_toom63_mul (struct speed_params *s)
 {
   SPEED_ROUTINE_MPN_TOOM63_MUL (mpn_toom63_mul);
 }
-double
-speed_mpn_toom32_for_toom43_mul (struct speed_params *s)
-{
-  SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL (mpn_toom32_mul);
-}
-double
-speed_mpn_toom43_for_toom32_mul (struct speed_params *s)
-{
-  SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL (mpn_toom43_mul);
-}
-double
-speed_mpn_toom32_for_toom53_mul (struct speed_params *s)
-{
-  SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL (mpn_toom32_mul);
-}
-double
-speed_mpn_toom53_for_toom32_mul (struct speed_params *s)
-{
-  SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL (mpn_toom53_mul);
-}
-double
-speed_mpn_toom42_for_toom53_mul (struct speed_params *s)
-{
-  SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL (mpn_toom42_mul);
-}
-double
-speed_mpn_toom53_for_toom42_mul (struct speed_params *s)
-{
-  SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL (mpn_toom53_mul);
-}
-double
-speed_mpn_toom43_for_toom54_mul (struct speed_params *s)
-{
-  SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL (mpn_toom43_mul);
-}
-double
-speed_mpn_toom54_for_toom43_mul (struct speed_params *s)
-{
-  SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL (mpn_toom54_mul);
-}
 
 double
 speed_mpn_nussbaumer_mul (struct speed_params *s)
diff -r 869c08440fbd tune/speed.c
--- a/tune/speed.c      Wed Nov 22 14:53:47 2023 +0100
+++ b/tune/speed.c      Tue Dec 19 19:52:01 2023 +0100
@@ -346,15 +346,15 @@
   { "mpn_toom4_sqr",     speed_mpn_toom4_sqr        },
   { "mpn_toom6_sqr",     speed_mpn_toom6_sqr        },
   { "mpn_toom8_sqr",     speed_mpn_toom8_sqr        },
-  { "mpn_toom22_mul",    speed_mpn_toom22_mul       },
-  { "mpn_toom33_mul",    speed_mpn_toom33_mul       },
-  { "mpn_toom44_mul",    speed_mpn_toom44_mul       },
-  { "mpn_toom6h_mul",    speed_mpn_toom6h_mul       },
-  { "mpn_toom8h_mul",    speed_mpn_toom8h_mul       },
-  { "mpn_toom32_mul",    speed_mpn_toom32_mul       },
-  { "mpn_toom42_mul",    speed_mpn_toom42_mul       },
-  { "mpn_toom43_mul",    speed_mpn_toom43_mul       },
-  { "mpn_toom63_mul",    speed_mpn_toom63_mul       },
+  { "mpn_toom22_mul",    speed_mpn_toom22_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom33_mul",    speed_mpn_toom33_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom44_mul",    speed_mpn_toom44_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom6h_mul",    speed_mpn_toom6h_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom8h_mul",    speed_mpn_toom8h_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom32_mul",    speed_mpn_toom32_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom42_mul",    speed_mpn_toom42_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom43_mul",    speed_mpn_toom43_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom63_mul",    speed_mpn_toom63_mul, FLAG_SR_OPTIONAL },
   { "mpn_nussbaumer_mul",    speed_mpn_nussbaumer_mul    },
   { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr},
 #if WANT_OLD_FFT_FULL
diff -r 869c08440fbd tune/speed.h
--- a/tune/speed.h      Wed Nov 22 14:53:47 2023 +0100
+++ b/tune/speed.h      Tue Dec 19 19:52:01 2023 +0100
@@ -375,15 +375,9 @@
 double speed_mpn_toom32_mul (struct speed_params *);
 double speed_mpn_toom42_mul (struct speed_params *);
 double speed_mpn_toom43_mul (struct speed_params *);
+double speed_mpn_toom53_mul (struct speed_params *);
+double speed_mpn_toom54_mul (struct speed_params *);
 double speed_mpn_toom63_mul (struct speed_params *);
-double speed_mpn_toom32_for_toom43_mul (struct speed_params *);
-double speed_mpn_toom43_for_toom32_mul (struct speed_params *);
-double speed_mpn_toom32_for_toom53_mul (struct speed_params *);
-double speed_mpn_toom53_for_toom32_mul (struct speed_params *);
-double speed_mpn_toom42_for_toom53_mul (struct speed_params *);
-double speed_mpn_toom53_for_toom42_mul (struct speed_params *);
-double speed_mpn_toom43_for_toom54_mul (struct speed_params *);
-double speed_mpn_toom54_for_toom43_mul (struct speed_params *);
 double speed_mpn_toom42_mulmid (struct speed_params *);
 double speed_mpn_mulmod_bnm1 (struct speed_params *);
 double speed_mpn_bc_mulmod_bnm1 (struct speed_params *);
@@ -1474,29 +1468,40 @@
     return t;                                                          \
   }
 
-#define SPEED_ROUTINE_MPN_MUL_TSPACE(call, tsize, minsize)             \
+#define SPEED_ROUTINE_MPN_MUL_TSPACE(function, itch, default_bn, valid)        
\
   {                                                                    \
     mp_ptr    wp, tspace;                                              \
+    mp_size_t an, bn, tn;                                              \
     unsigned  i;                                                       \
     double    t;                                                       \
     TMP_DECL;                                                          \
                                                                        \
-    SPEED_RESTRICT_COND (s->size >= minsize);                          \
+    an = s->size;                                                      \
+    bn = s->size_ratio * s->size;                                      \
+    if (bn == 0)                                                       \
+      {                                                                        
\
+       bn = (s->r == 0 ? default_bn : s->r);                           \
+       if (bn < 0) bn = -bn - an;                                      \
+      }                                                                        
\
+    SPEED_RESTRICT_COND (bn >= 1);                                     \
+    SPEED_RESTRICT_COND (an >= bn);                                    \
+    SPEED_RESTRICT_COND (valid);                                       \
+    tn = itch(an, bn);                                                 \
                                                                        \
     TMP_MARK;                                                          \
-    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);                        
\
-    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);               \
-                                                                       \
-    speed_operand_src (s, s->xp, s->size);                             \
-    speed_operand_src (s, s->yp, s->size);                             \
-    speed_operand_dst (s, wp, 2*s->size);                              \
-    speed_operand_dst (s, tspace, tsize);                              \
+    SPEED_TMP_ALLOC_LIMBS (wp, an + bn, s->align_wp);                  \
+    SPEED_TMP_ALLOC_LIMBS (tspace, tn, s->align_wp2);                  \
+                                                                       \
+    speed_operand_src (s, s->xp, an);                                  \
+    speed_operand_src (s, s->yp, bn);                                  \
+    speed_operand_dst (s, wp, an + bn);                                        
\
+    speed_operand_dst (s, tspace, tn);                                 \
     speed_cache_fill (s);                                              \
                                                                        \
     speed_starttime ();                                                        
\
     i = s->reps;                                                       \
     do                                                                 \
-      call;                                                            \
+      function (wp, s->xp, an, s->yp, bn, tspace);                     \
     while (--i != 0);                                                  \
     t = speed_endtime ();                                              \
                                                                        \
@@ -1506,102 +1511,58 @@
 
 #define SPEED_ROUTINE_MPN_TOOM22_MUL(function)                         \
   SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, s->size, tspace),            \
-     mpn_toom22_mul_itch (s->size, s->size),                           \
-     MPN_TOOM22_MUL_MINSIZE)
+    (function, mpn_toom22_mul_itch,                                    \
+     an, 5*bn > 4*an)
 
 #define SPEED_ROUTINE_MPN_TOOM33_MUL(function)                         \
   SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, s->size, tspace),            \
-     mpn_toom33_mul_itch (s->size, s->size),                           \
-     MPN_TOOM33_MUL_MINSIZE)
+    (function, mpn_toom33_mul_itch,                                    \
+     an, bn > 2 * ((an+2) / 3))
 
 #define SPEED_ROUTINE_MPN_TOOM44_MUL(function)                         \
   SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, s->size, tspace),            \
-     mpn_toom44_mul_itch (s->size, s->size),                           \
-     MPN_TOOM44_MUL_MINSIZE)
+    (function, mpn_toom44_mul_itch,                                    \
+     an, bn > 3*((an + 3) >> 2))
 
 #define SPEED_ROUTINE_MPN_TOOM6H_MUL(function)                         \
   SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, s->size, tspace),            \
-     mpn_toom6h_mul_itch (s->size, s->size),                           \
-     MPN_TOOM6H_MUL_MINSIZE)
+  (function, mpn_toom6h_mul_itch,                                      \
+   an, bn >= 42 && ((an*3 <  bn * 8) || (bn >= 46 && an * 6 <  bn * 17)))
 
 #define SPEED_ROUTINE_MPN_TOOM8H_MUL(function)                         \
   SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, s->size, tspace),            \
-     mpn_toom8h_mul_itch (s->size, s->size),                           \
-     MPN_TOOM8H_MUL_MINSIZE)
+    (function, mpn_toom8h_mul_itch,                                    \
+     an, (bn >= 86) && an*4 <= bn*11)
 
 #define SPEED_ROUTINE_MPN_TOOM32_MUL(function)                         \
   SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace),                
\
-     mpn_toom32_mul_itch (s->size, 2*s->size/3),                       \
-     MPN_TOOM32_MUL_MINSIZE)
+    (function, mpn_toom32_mul_itch,                                    \
+     2*an / 3, bn + 2 <= an && an + 6 <= 3*bn)
 
 #define SPEED_ROUTINE_MPN_TOOM42_MUL(function)                         \
   SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),          \
-     mpn_toom42_mul_itch (s->size, s->size/2),                         \
-     MPN_TOOM42_MUL_MINSIZE)
+  (function, mpn_toom42_mul_itch,                                      \
+   an / 2, an >= 7 && bn >= 2 && an > 3*((bn+1)/2) && bn > ((an+3)/4))
 
 #define SPEED_ROUTINE_MPN_TOOM43_MUL(function)                         \
   SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace),                
\
-     mpn_toom43_mul_itch (s->size, s->size*3/4),                       \
-     MPN_TOOM43_MUL_MINSIZE)
+    (function, mpn_toom43_mul_itch,                                    \
+     an*3/4, an >= 7 && bn >= 5 && an > 3 * ((bn+2)/3) && bn > 2 * ((an+3)/4))
+
+#define SPEED_ROUTINE_MPN_TOOM53_MUL(function)                         \
+  SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
+    (function, mpn_toom53_mul_itch,                                    \
+     an*3/5, an >= 17 && bn >= 5 && an > 4 * ((bn+2)/3) && bn > 2 * ((an+4)/5))
+
+#define SPEED_ROUTINE_MPN_TOOM54_MUL(function)                         \
+  SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
+    (function, mpn_toom54_mul_itch,                                    \
+     an*4/5, an >= 17 && bn >= 10 && an > 4 * ((bn+3)/4) && bn > 3 * 
((an+4)/5))
 
 #define SPEED_ROUTINE_MPN_TOOM63_MUL(function)                         \
   SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),          \
-     mpn_toom63_mul_itch (s->size, s->size/2),                         \
-     MPN_TOOM63_MUL_MINSIZE)
-
-#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function)              \
-  SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),      \
-     mpn_toom32_mul_itch (s->size, 17*s->size/24),                     \
-     MPN_TOOM32_MUL_MINSIZE)
-#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function)              \
-  SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),      \
-     mpn_toom43_mul_itch (s->size, 17*s->size/24),                     \
-     MPN_TOOM43_MUL_MINSIZE)
-
-#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function)              \
-  SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),      \
-     mpn_toom32_mul_itch (s->size, 19*s->size/30),                     \
-     MPN_TOOM32_MUL_MINSIZE)
-#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function)              \
-  SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),      \
-     mpn_toom53_mul_itch (s->size, 19*s->size/30),                     \
-     MPN_TOOM53_MUL_MINSIZE)
-
-#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function)              \
-  SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),      \
-     mpn_toom42_mul_itch (s->size, 11*s->size/20),                     \
-     MPN_TOOM42_MUL_MINSIZE)
-#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function)              \
-  SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),      \
-     mpn_toom53_mul_itch (s->size, 11*s->size/20),                     \
-     MPN_TOOM53_MUL_MINSIZE)
-
-#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function)              \
-  SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),        \
-     mpn_toom42_mul_itch (s->size, 5*s->size/6),                       \
-     MPN_TOOM54_MUL_MINSIZE)
-#define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function)              \
-  SPEED_ROUTINE_MPN_MUL_TSPACE                                         \
-    (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),        \
-     mpn_toom54_mul_itch (s->size, 5*s->size/6),                       \
-     MPN_TOOM54_MUL_MINSIZE)
-
+    (function, mpn_toom63_mul_itch,                                    \
+     an/2, an >= 26 && bn >= 5 && an > 5*((bn+2)/3) && bn > 2*((an+5)/6))
 
 
 #define SPEED_ROUTINE_MPN_SQR_CALL(call)                               \
diff -r 869c08440fbd tune/tuneup.c
--- a/tune/tuneup.c     Wed Nov 22 14:53:47 2023 +0100
+++ b/tune/tuneup.c     Tue Dec 19 19:52:01 2023 +0100
@@ -1417,30 +1417,36 @@
 
   param.noprint = 1;
 
-  param.function = speed_mpn_toom32_for_toom43_mul;
-  param.function2 = speed_mpn_toom43_for_toom32_mul;
+  s.r = 0;             /* clear, use size_ratio for these tests. */
+
+  s.size_ratio = 17.0 / 24;
+  param.function = speed_mpn_toom32_mul;
+  param.function2 = speed_mpn_toom43_mul;
   param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
   param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
   one (&thres, &param);
   mul_toom32_to_toom43_threshold = thres * 17 / 24;
   print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", 
mul_toom32_to_toom43_threshold);
 
-  param.function = speed_mpn_toom32_for_toom53_mul;
-  param.function2 = speed_mpn_toom53_for_toom32_mul;
+  s.size_ratio = 19.0 / 30;
+  param.function = speed_mpn_toom32_mul;
+  param.function2 = speed_mpn_toom53_mul;
   param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
   param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
   one (&thres, &param);
   mul_toom32_to_toom53_threshold = thres * 19 / 30;
   print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", 
mul_toom32_to_toom53_threshold);
 
-  param.function = speed_mpn_toom42_for_toom53_mul;
-  param.function2 = speed_mpn_toom53_for_toom42_mul;
+  s.size_ratio = 11.0 / 20;
+  param.function = speed_mpn_toom42_mul;
+  param.function2 = speed_mpn_toom53_mul;
   param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
   param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
   one (&thres, &param);
   mul_toom42_to_toom53_threshold = thres * 11 / 20;
   print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", 
mul_toom42_to_toom53_threshold);
 
+  s.size_ratio = 0.5;
   param.function = speed_mpn_toom42_mul;
   param.function2 = speed_mpn_toom63_mul;
   param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
@@ -1450,13 +1456,17 @@
   print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", 
mul_toom42_to_toom63_threshold);
 
   /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
-  param.function = speed_mpn_toom43_for_toom54_mul;
-  param.function2 = speed_mpn_toom54_for_toom43_mul;
+  s.size_ratio = 5.0 / 6;
+  param.function = speed_mpn_toom43_mul;
+  param.function2 = speed_mpn_toom54_mul;
   param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
   param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
   one (&thres, &param);
   mul_toom43_to_toom54_threshold = thres * 5 / 6;
   print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", 
mul_toom43_to_toom54_threshold);
+
+  /* Reset for other tests. */
+  s.size_ratio = 0.0;
 }
 
 

-- 
Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
Internet email is subject to wholesale government surveillance.

_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to