Below is the next patch, which updates the benchmarking of the toom mul functions to take a size_ratio parameter and uses it when tuning thresholds like MUL_TOOM32_TO_TOOM43_THRESHOLD. It seems to work fine for me, but it would be nice if someone else could run it and check that the resulting thresholds look sane.
One thing that looks odd is /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */ for the tuning of MUL_TOOM43_TO_TOOM54_THRESHOLD. Ideal ratio for toom43 is 3/4 = 0.75, and ideal for toom54 is 4/5 = 0.8. And so 5/6 = 0.833... is higher than both. I would have expected the midpoint of 3/4 and 4/5, which would be 31/40. Regards, /Niels diff -r 869c08440fbd tune/common.c --- a/tune/common.c Wed Nov 22 14:53:47 2023 +0100 +++ b/tune/common.c Tue Dec 19 19:52:01 2023 +0100 @@ -1386,50 +1386,20 @@ SPEED_ROUTINE_MPN_TOOM43_MUL (mpn_toom43_mul); } double +speed_mpn_toom53_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM53_MUL (mpn_toom53_mul); +} +double +speed_mpn_toom54_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM54_MUL (mpn_toom54_mul); +} +double speed_mpn_toom63_mul (struct speed_params *s) { SPEED_ROUTINE_MPN_TOOM63_MUL (mpn_toom63_mul); } -double -speed_mpn_toom32_for_toom43_mul (struct speed_params *s) -{ - SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL (mpn_toom32_mul); -} -double -speed_mpn_toom43_for_toom32_mul (struct speed_params *s) -{ - SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL (mpn_toom43_mul); -} -double -speed_mpn_toom32_for_toom53_mul (struct speed_params *s) -{ - SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL (mpn_toom32_mul); -} -double -speed_mpn_toom53_for_toom32_mul (struct speed_params *s) -{ - SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL (mpn_toom53_mul); -} -double -speed_mpn_toom42_for_toom53_mul (struct speed_params *s) -{ - SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL (mpn_toom42_mul); -} -double -speed_mpn_toom53_for_toom42_mul (struct speed_params *s) -{ - SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL (mpn_toom53_mul); -} -double -speed_mpn_toom43_for_toom54_mul (struct speed_params *s) -{ - SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL (mpn_toom43_mul); -} -double -speed_mpn_toom54_for_toom43_mul (struct speed_params *s) -{ - SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL (mpn_toom54_mul); -} double speed_mpn_nussbaumer_mul (struct speed_params *s) 
diff -r 869c08440fbd tune/speed.c --- a/tune/speed.c Wed Nov 22 14:53:47 2023 +0100 +++ b/tune/speed.c Tue Dec 19 19:52:01 2023 +0100 @@ -346,15 +346,15 @@ { "mpn_toom4_sqr", speed_mpn_toom4_sqr }, { "mpn_toom6_sqr", speed_mpn_toom6_sqr }, { "mpn_toom8_sqr", speed_mpn_toom8_sqr }, - { "mpn_toom22_mul", speed_mpn_toom22_mul }, - { "mpn_toom33_mul", speed_mpn_toom33_mul }, - { "mpn_toom44_mul", speed_mpn_toom44_mul }, - { "mpn_toom6h_mul", speed_mpn_toom6h_mul }, - { "mpn_toom8h_mul", speed_mpn_toom8h_mul }, - { "mpn_toom32_mul", speed_mpn_toom32_mul }, - { "mpn_toom42_mul", speed_mpn_toom42_mul }, - { "mpn_toom43_mul", speed_mpn_toom43_mul }, - { "mpn_toom63_mul", speed_mpn_toom63_mul }, + { "mpn_toom22_mul", speed_mpn_toom22_mul, FLAG_SR_OPTIONAL }, + { "mpn_toom33_mul", speed_mpn_toom33_mul, FLAG_SR_OPTIONAL }, + { "mpn_toom44_mul", speed_mpn_toom44_mul, FLAG_SR_OPTIONAL }, + { "mpn_toom6h_mul", speed_mpn_toom6h_mul, FLAG_SR_OPTIONAL }, + { "mpn_toom8h_mul", speed_mpn_toom8h_mul, FLAG_SR_OPTIONAL }, + { "mpn_toom32_mul", speed_mpn_toom32_mul, FLAG_SR_OPTIONAL }, + { "mpn_toom42_mul", speed_mpn_toom42_mul, FLAG_SR_OPTIONAL }, + { "mpn_toom43_mul", speed_mpn_toom43_mul, FLAG_SR_OPTIONAL }, + { "mpn_toom63_mul", speed_mpn_toom63_mul, FLAG_SR_OPTIONAL }, { "mpn_nussbaumer_mul", speed_mpn_nussbaumer_mul }, { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr}, #if WANT_OLD_FFT_FULL diff -r 869c08440fbd tune/speed.h --- a/tune/speed.h Wed Nov 22 14:53:47 2023 +0100 +++ b/tune/speed.h Tue Dec 19 19:52:01 2023 +0100 @@ -375,15 +375,9 @@ double speed_mpn_toom32_mul (struct speed_params *); double speed_mpn_toom42_mul (struct speed_params *); double speed_mpn_toom43_mul (struct speed_params *); +double speed_mpn_toom53_mul (struct speed_params *); +double speed_mpn_toom54_mul (struct speed_params *); double speed_mpn_toom63_mul (struct speed_params *); -double speed_mpn_toom32_for_toom43_mul (struct speed_params *); -double speed_mpn_toom43_for_toom32_mul (struct 
speed_params *); -double speed_mpn_toom32_for_toom53_mul (struct speed_params *); -double speed_mpn_toom53_for_toom32_mul (struct speed_params *); -double speed_mpn_toom42_for_toom53_mul (struct speed_params *); -double speed_mpn_toom53_for_toom42_mul (struct speed_params *); -double speed_mpn_toom43_for_toom54_mul (struct speed_params *); -double speed_mpn_toom54_for_toom43_mul (struct speed_params *); double speed_mpn_toom42_mulmid (struct speed_params *); double speed_mpn_mulmod_bnm1 (struct speed_params *); double speed_mpn_bc_mulmod_bnm1 (struct speed_params *); @@ -1474,29 +1468,40 @@ return t; \ } -#define SPEED_ROUTINE_MPN_MUL_TSPACE(call, tsize, minsize) \ +#define SPEED_ROUTINE_MPN_MUL_TSPACE(function, itch, default_bn, valid) \ { \ mp_ptr wp, tspace; \ + mp_size_t an, bn, tn; \ unsigned i; \ double t; \ TMP_DECL; \ \ - SPEED_RESTRICT_COND (s->size >= minsize); \ + an = s->size; \ + bn = s->size_ratio * s->size; \ + if (bn == 0) \ + { \ + bn = (s->r == 0 ? default_bn : s->r); \ + if (bn < 0) bn = -bn - an; \ + } \ + SPEED_RESTRICT_COND (bn >= 1); \ + SPEED_RESTRICT_COND (an >= bn); \ + SPEED_RESTRICT_COND (valid); \ + tn = itch(an, bn); \ \ TMP_MARK; \ - SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ - SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \ - \ - speed_operand_src (s, s->xp, s->size); \ - speed_operand_src (s, s->yp, s->size); \ - speed_operand_dst (s, wp, 2*s->size); \ - speed_operand_dst (s, tspace, tsize); \ + SPEED_TMP_ALLOC_LIMBS (wp, an + bn, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tspace, tn, s->align_wp2); \ + \ + speed_operand_src (s, s->xp, an); \ + speed_operand_src (s, s->yp, bn); \ + speed_operand_dst (s, wp, an + bn); \ + speed_operand_dst (s, tspace, tn); \ speed_cache_fill (s); \ \ speed_starttime (); \ i = s->reps; \ do \ - call; \ + function (wp, s->xp, an, s->yp, bn, tspace); \ while (--i != 0); \ t = speed_endtime (); \ \ @@ -1506,102 +1511,58 @@ #define SPEED_ROUTINE_MPN_TOOM22_MUL(function) \ 
SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ - mpn_toom22_mul_itch (s->size, s->size), \ - MPN_TOOM22_MUL_MINSIZE) + (function, mpn_toom22_mul_itch, \ + an, 5*bn > 4*an) #define SPEED_ROUTINE_MPN_TOOM33_MUL(function) \ SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ - mpn_toom33_mul_itch (s->size, s->size), \ - MPN_TOOM33_MUL_MINSIZE) + (function, mpn_toom33_mul_itch, \ + an, bn > 2 * ((an+2) / 3)) #define SPEED_ROUTINE_MPN_TOOM44_MUL(function) \ SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ - mpn_toom44_mul_itch (s->size, s->size), \ - MPN_TOOM44_MUL_MINSIZE) + (function, mpn_toom44_mul_itch, \ + an, bn > 3*((an + 3) >> 2)) #define SPEED_ROUTINE_MPN_TOOM6H_MUL(function) \ SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ - mpn_toom6h_mul_itch (s->size, s->size), \ - MPN_TOOM6H_MUL_MINSIZE) + (function, mpn_toom6h_mul_itch, \ + an, bn >= 42 && ((an*3 < bn * 8) || (bn >= 46 && an * 6 < bn * 17))) #define SPEED_ROUTINE_MPN_TOOM8H_MUL(function) \ SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ - mpn_toom8h_mul_itch (s->size, s->size), \ - MPN_TOOM8H_MUL_MINSIZE) + (function, mpn_toom8h_mul_itch, \ + an, (bn >= 86) && an*4 <= bn*11) #define SPEED_ROUTINE_MPN_TOOM32_MUL(function) \ SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace), \ - mpn_toom32_mul_itch (s->size, 2*s->size/3), \ - MPN_TOOM32_MUL_MINSIZE) + (function, mpn_toom32_mul_itch, \ + 2*an / 3, bn + 2 <= an && an + 6 <= 3*bn) #define SPEED_ROUTINE_MPN_TOOM42_MUL(function) \ SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \ - mpn_toom42_mul_itch (s->size, s->size/2), \ - MPN_TOOM42_MUL_MINSIZE) + (function, mpn_toom42_mul_itch, \ + an / 2, an >= 7 && bn >= 2 && an > 3*((bn+1)/2) && bn > ((an+3)/4)) #define 
SPEED_ROUTINE_MPN_TOOM43_MUL(function) \ SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace), \ - mpn_toom43_mul_itch (s->size, s->size*3/4), \ - MPN_TOOM43_MUL_MINSIZE) + (function, mpn_toom43_mul_itch, \ + an*3/4, an >= 7 && bn >= 5 && an > 3 * ((bn+2)/3) && bn > 2 * ((an+3)/4)) + +#define SPEED_ROUTINE_MPN_TOOM53_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_TSPACE \ + (function, mpn_toom53_mul_itch, \ + an*3/5, an >= 17 && bn >= 5 && an > 4 * ((bn+2)/3) && bn > 2 * ((an+4)/5)) + +#define SPEED_ROUTINE_MPN_TOOM54_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_TSPACE \ + (function, mpn_toom54_mul_itch, \ + an*4/5, an >= 17 && bn >= 10 && an > 4 * ((bn+3)/4) && bn > 3 * ((an+4)/5)) #define SPEED_ROUTINE_MPN_TOOM63_MUL(function) \ SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \ - mpn_toom63_mul_itch (s->size, s->size/2), \ - MPN_TOOM63_MUL_MINSIZE) - -#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function) \ - SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \ - mpn_toom32_mul_itch (s->size, 17*s->size/24), \ - MPN_TOOM32_MUL_MINSIZE) -#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function) \ - SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \ - mpn_toom43_mul_itch (s->size, 17*s->size/24), \ - MPN_TOOM43_MUL_MINSIZE) - -#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function) \ - SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \ - mpn_toom32_mul_itch (s->size, 19*s->size/30), \ - MPN_TOOM32_MUL_MINSIZE) -#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function) \ - SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \ - mpn_toom53_mul_itch (s->size, 19*s->size/30), \ - MPN_TOOM53_MUL_MINSIZE) - -#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function) \ - SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, 
s->xp, s->size, s->yp, 11*s->size/20, tspace), \ - mpn_toom42_mul_itch (s->size, 11*s->size/20), \ - MPN_TOOM42_MUL_MINSIZE) -#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function) \ - SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \ - mpn_toom53_mul_itch (s->size, 11*s->size/20), \ - MPN_TOOM53_MUL_MINSIZE) - -#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function) \ - SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \ - mpn_toom42_mul_itch (s->size, 5*s->size/6), \ - MPN_TOOM54_MUL_MINSIZE) -#define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function) \ - SPEED_ROUTINE_MPN_MUL_TSPACE \ - (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \ - mpn_toom54_mul_itch (s->size, 5*s->size/6), \ - MPN_TOOM54_MUL_MINSIZE) - + (function, mpn_toom63_mul_itch, \ + an/2, an >= 26 && bn >= 5 && an > 5*((bn+2)/3) && bn > 2*((an+5)/6)) #define SPEED_ROUTINE_MPN_SQR_CALL(call) \ diff -r 869c08440fbd tune/tuneup.c --- a/tune/tuneup.c Wed Nov 22 14:53:47 2023 +0100 +++ b/tune/tuneup.c Tue Dec 19 19:52:01 2023 +0100 @@ -1417,30 +1417,36 @@ param.noprint = 1; - param.function = speed_mpn_toom32_for_toom43_mul; - param.function2 = speed_mpn_toom43_for_toom32_mul; + s.r = 0; /* clear, use size_ratio for these tests. 
*/ + + s.size_ratio = 17.0 / 24; + param.function = speed_mpn_toom32_mul; + param.function2 = speed_mpn_toom43_mul; param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD"; param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17; one (&thres, &param); mul_toom32_to_toom43_threshold = thres * 17 / 24; print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold); - param.function = speed_mpn_toom32_for_toom53_mul; - param.function2 = speed_mpn_toom53_for_toom32_mul; + s.size_ratio = 19.0 / 30; + param.function = speed_mpn_toom32_mul; + param.function2 = speed_mpn_toom53_mul; param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD"; param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19; one (&thres, &param); mul_toom32_to_toom53_threshold = thres * 19 / 30; print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold); - param.function = speed_mpn_toom42_for_toom53_mul; - param.function2 = speed_mpn_toom53_for_toom42_mul; + s.size_ratio = 11.0 / 20; + param.function = speed_mpn_toom42_mul; + param.function2 = speed_mpn_toom53_mul; param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD"; param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11; one (&thres, &param); mul_toom42_to_toom53_threshold = thres * 11 / 20; print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold); + s.size_ratio = 0.5; param.function = speed_mpn_toom42_mul; param.function2 = speed_mpn_toom63_mul; param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD"; @@ -1450,13 +1456,17 @@ print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold); /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. 
*/ - param.function = speed_mpn_toom43_for_toom54_mul; - param.function2 = speed_mpn_toom54_for_toom43_mul; + s.size_ratio = 5.0 / 6; + param.function = speed_mpn_toom43_mul; + param.function2 = speed_mpn_toom54_mul; param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD"; param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5; one (&thres, &param); mul_toom43_to_toom54_threshold = thres * 5 / 6; print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold); + + /* Reset for other tests. */ + s.size_ratio = 0.0; } -- Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677. Internet email is subject to wholesale government surveillance. _______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org https://gmplib.org/mailman/listinfo/gmp-devel