Ciao, while some discussion on division emerges... Even though it is completely unrelated, I am posting a proposed patch to rearrange the operations in udiv_qr_4by2. It should be faster, although I do not know how much it is actually used: almost all thresholds in https://gmplib.org/devel/thres/DIV_QR_2_PI2_THRESHOLD.html are MP_SIZE_T_MAX /* never */.
The patch replaces 1 add_SSSaaaa and 2 += with 1 add_SSaaaa and 1 ++. *** /tmp/extdiff.bZsVXA/gmp.746b5528f6a5/mpn/generic/div_qr_2.c 2019-08-25 10:32:39.000000000 +0200 --- /home/bodrato/gmp/mpn/generic/div_qr_2.c 2019-08-26 21:47:22.562877220 +0200 *************** *** 111,136 **** /* Typically used with r1, r0 same as n3, n2. Other types of overlap between inputs and outputs are not supported. */ #define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \ do { \ mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \ mp_limb_t _t1, _t0; \ ! mp_limb_t _c, _mask; \ \ ! umul_ppmm (_q3,_q2a, n3, di1); \ umul_ppmm (_q2,_q1, n2, di1); \ umul_ppmm (_q2c,_q1c, n3, di0); \ ! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c); \ umul_ppmm (_q1d,_q0, n2, di0); \ ! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d); \ ! \ ! /* [q3,q2,q1,q0] += [n3,n2,n1,n0] + [ 0, 1, 0, 0] */ \ ! _q3 += n3; \ ! _q0 += n0; _c = n0 > _q0; \ ! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2, _c); \ ! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, CNST_LIMB(1), n1); \ \ umul_ppmm (_t1,_t0, _q2, d0); \ _t1 += _q2 * d1 + _q3 * d0; \ \ sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \ \ --- 111,134 ---- /* Typically used with r1, r0 same as n3, n2. Other types of overlap between inputs and outputs are not supported. */ #define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \ do { \ mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \ mp_limb_t _t1, _t0; \ ! mp_limb_t _mask; \ \ ! /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */ \ umul_ppmm (_q2,_q1, n2, di1); \ + umul_ppmm (_q3,_q2a, n3, di1); \ + ++_q2; /* _q2 cannot overflow */ \ + add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a); \ umul_ppmm (_q2c,_q1c, n3, di0); \ ! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c); \ umul_ppmm (_q1d,_q0, n2, di0); \ ! add_sssaaaa (_q2c,_q1d,_q0, _q1d,_q0, n1,n0); /* _q2c cannot overflow */ \ ! 
add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d); \ \ umul_ppmm (_t1,_t0, _q2, d0); \ _t1 += _q2 * d1 + _q3 * d0; \ \ sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \ \ Ĝis, m -- http://bodrato.it/papers/ _______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org https://gmplib.org/mailman/listinfo/gmp-devel