> * Speed up div1, presumably with a table-based approach. A quick-and-dirty but working variant below.
The table size with NBITS = 8 is 32KiB. Far too big. It avoids branches in 83% of the cases, which is not that great considering the huge cost of mispredicted branches. With some thought, I am sure the table size could come down while the accuracy could improve. #define NBITS 8 static unsigned char tab[1 << (2 * NBITS - 1)]; static unsigned char *tabp = tab - (1 << (NBITS - 1) << NBITS); mp_double_limb_t div1 (mp_limb_t n0, mp_limb_t d0) { int ncnt; size_t ni, di; mp_limb_t q0; mp_limb_t r0; mp_limb_t mask; mp_double_limb_t res; count_leading_zeros (ncnt, n0); ni = n0 >> (64 - NBITS - ncnt); di = d0 >> (64 - NBITS - ncnt); if (UNLIKELY (di < (1 << NBITS/2))) { q0 = n0 / d0; r0 = n0 - q0 * d0; } else { q0 = tabp[(ni << NBITS) + di]; r0 = n0 - q0 * d0; mask = -(mp_limb_t) (r0 >= d0); q0 -= mask; r0 -= d0 & mask; } res.d1 = q0; res.d0 = r0; return res; } void init_tab() { size_t ni, di; for (ni = 1 << (NBITS - 1); ni < (1 << NBITS); ni++) { for (di = 0; di < (1 << NBITS); di++) { tabp[(ni << NBITS) + di] = (ni << NBITS) / ((di << NBITS) + ((1 << NBITS)-1)); } } } -- Torbjörn Please encrypt, key id 0xC8601622 _______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org https://gmplib.org/mailman/listinfo/gmp-devel