Ciao,

While some discussion on division is emerging... Even though it is completely
unrelated, I am posting a proposed patch to rearrange the operations in
udiv_qr_4by2.
It should be faster, although I do not know how much it is actually used:
almost all the thresholds listed at
https://gmplib.org/devel/thres/DIV_QR_2_PI2_THRESHOLD.html are
MP_SIZE_T_MAX /* never */

The patch replaces one add_sssaaaa and two += with one add_ssaaaa and one ++.

*** /tmp/extdiff.bZsVXA/gmp.746b5528f6a5/mpn/generic/div_qr_2.c 2019-08-25
10:32:39.000000000 +0200
--- /home/bodrato/gmp/mpn/generic/div_qr_2.c    2019-08-26 21:47:22.562877220
+0200
***************
*** 111,136 ****
  /* Typically used with r1, r0 same as n3, n2. Other types of overlap
     between inputs and outputs are not supported. */
  #define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)               
\
    do {                                                                        
\
      mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;             \
      mp_limb_t _t1, _t0;                                                       
\
!     mp_limb_t _c, _mask;                                              \
                                                                        \
!     umul_ppmm (_q3,_q2a, n3, di1);                                    \
      umul_ppmm (_q2,_q1, n2, di1);                                     \
      umul_ppmm (_q2c,_q1c, n3, di0);                                   \
!     add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c);                    \
      umul_ppmm (_q1d,_q0, n2, di0);                                    \
!     add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d);                    \
!                                                                       \
!     /* [q3,q2,q1,q0] += [n3,n2,n1,n0] + [ 0, 1, 0, 0] */              \
!     _q3 += n3;                                                                
\
!     _q0 += n0; _c = n0 > _q0;                                         \
!     add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2, _c);                               
\
!     add_sssaaaa (_q3,_q2,_q1, _q2,_q1, CNST_LIMB(1), n1);             \
                                                                        \
      umul_ppmm (_t1,_t0, _q2, d0);                                     \
      _t1 += _q2 * d1 + _q3 * d0;                                               
\
                                                                        \
      sub_ddmmss (r1, r0, n1, n0, _t1, _t0);                            \
                                                                        \
--- 111,134 ----
  /* Typically used with r1, r0 same as n3, n2. Other types of overlap
     between inputs and outputs are not supported. */
  #define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)               
\
    do {                                                                        
\
      mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;             \
      mp_limb_t _t1, _t0;                                                       
\
!     mp_limb_t _mask;                                                  \
                                                                        \
!     /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */       
\
      umul_ppmm (_q2,_q1, n2, di1);                                     \
+     umul_ppmm (_q3,_q2a, n3, di1);                                    \
+     ++_q2;    /* _q2 cannot overflow */                               \
+     add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);                           \
      umul_ppmm (_q2c,_q1c, n3, di0);                                   \
!     add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);                      \
      umul_ppmm (_q1d,_q0, n2, di0);                                    \
!     add_sssaaaa (_q2c,_q1d,_q0, _q1d,_q0, n1,n0); /* _q2c cannot
overflow */ \
!     add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);                    \
                                                                        \
      umul_ppmm (_t1,_t0, _q2, d0);                                     \
      _t1 += _q2 * d1 + _q3 * d0;                                               
\
                                                                        \
      sub_ddmmss (r1, r0, n1, n0, _t1, _t0);                            \
                                                                        \


Ĝis,
m

-- 
http://bodrato.it/papers/

_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to