Ciao,
Il Mer, 28 Agosto 2019 11:39 am, Torbjörn Granlund ha scritto:
> Marco Bodrato <bodr...@mail.dm.unipi.it> writes:
> or maybe simply a loop that knows that U will be smaller until also V
> will be one (some) limb shorter...
>
> ...and finally tail-call to the smaller fixed-size asm function?
>
> I think this is a good proposal.
I implemented it, you can find it attached.
The funny thing is that with the attached code, the whole chain gcd_55 ->
lowz -> gcd_44 -> lowz -> gcd_33 -> lowz is handled with tail calls.
On the other hand, a great part of the code is duplicated, but it cannot
exploit the fixed-size shortcuts... I'm not so sure the resulting code
is useful.
Ĝis,
m
--
http://bodrato.it/papers/
#include "gmp-impl.h"
#include "longlong.h"
#if defined (__amd64__) && W_TYPE_SIZE == 64
#define CMP_SWAP(ap,bp,n) \
do { \
mp_limb_t tmp; \
__asm__ ("cmp\t%5, %6" "\n\t" \
"mov\t%1, %0" "\n\t" \
"cmovae\t%2, %1" "\n\t" \
"cmovae\t%0, %2" \
: "=&r,&r" (tmp), "=r,r" (ap), "=r,r" (bp) \
: "1,1" (ap), "2,2" (bp), "r,rm" (ap[n-1]), "rm,r" (bp[n-1])); \
} while (0)
#define badCMP_SWAP(ap,bp,n) \
do { \
mp_limb_t tmp; \
__asm__ ("cmp\t%5, %6" "\n\t" \
"mov\t%3, %0" "\n\t" \
"cmovae\t%4, %1" "\n\t" \
"cmovae\t%0, %2" \
: "=&r,&r" (tmp), "=r,r" (ap), "=r,r" (bp) \
: "r,r" (ap), "r,r" (bp), "r,rm" (ap[n-1]), "rm,r" (bp[n-1])); \
} while (0)
#endif
#ifndef CMP_SWAP
/* Portable branch-free CMP_SWAP: m is an all-ones mask when
   ap[n-1] < bp[n-1], all-zeros otherwise; the two mask-and-merge
   expressions then swap the pointers through integer casts.  This swaps
   only on strict '<' of the top limbs (the asm variant also swaps on
   equality), but either way ap ends with the larger-or-equal high limb,
   which is all the caller needs.
   NOTE(review): the round trip goes through size_t; uintptr_t would be
   the strictly portable pointer<->integer type -- confirm against GMP's
   portability baseline before changing.  */
#define CMP_SWAP(ap,bp,n) \
do { \
mp_limb_t m = -(mp_limb_t) (ap[n - 1] < bp[n - 1]); \
mp_limb_t *_tp = ap; \
ap = (mp_ptr) (((size_t) ap & ~m) | ((size_t) bp & m)); \
bp = (mp_ptr) (((size_t) bp & ~m) | ((size_t) _tp & m)); \
} while (0)
#endif
/* Plain branching CMP_SWAP.  Dead code as written: the #ifndef block
   just above always defines CMP_SWAP when the asm variant did not, so
   this guard can never fire.  Kept as the readable reference for the
   intended semantics: ensure ap[n-1] >= bp[n-1] by swapping the
   pointers when needed.  */
#ifndef CMP_SWAP
#define CMP_SWAP(ap,bp,n) \
do { \
if (ap[n - 1] < bp[n - 1]) \
{ \
mp_limb_t *_tp = ap; ap = bp; bp = _tp; \
} \
} while (0)
#endif
/* Paste two preprocessor tokens; used below to build the mpn_gcd_NN
   entry-point names.  */
#define SMASH(a,b) a##b
/* Fixed-size gcd entry points: rp = gcd(up, vp) for 3-, 4- and 5-limb
   operands.  up and vp are clobbered (reduced in place).  */
void mpn_gcd_33 (mp_limb_t *, mp_limb_t *, mp_limb_t *);
void mpn_gcd_44 (mp_limb_t *, mp_limb_t *, mp_limb_t *);
void mpn_gcd_55 (mp_limb_t *, mp_limb_t *, mp_limb_t *);
/* Optional fixed-size assembly helpers, used only when the matching
   HAVE_NATIVE_* macros are set (see mpn_gcd_NN below).  */
mp_limb_t mpn_sub_3 (mp_ptr, mp_srcptr, mp_srcptr);
mp_limb_t mpn_sub_4 (mp_ptr, mp_srcptr, mp_srcptr);
mp_limb_t mpn_sub_5 (mp_ptr, mp_srcptr, mp_srcptr);
void mpn_rshift_3 (mp_ptr, mp_srcptr, int);
void mpn_rshift_4 (mp_ptr, mp_srcptr, int);
void mpn_rshift_5 (mp_ptr, mp_srcptr, int);
/* Slow path for the fixed-size gcd kernel.  On entry up[0] == 0, i.e.
   U has at least one zero low limb.  Strips the low zero limbs (and any
   further trailing zero bits) from U, keeps reducing V against U until
   both operands have shrunk by that many whole limbs, then dispatches
   to the next smaller fixed-size gcd primitive.  rp receives
   gcd(U, V) in n limbs (high limbs zero-filled as the size shrinks).
   NOTE(review): appears to assume V is odd on entry -- the callers pass
   the odd-normalized operand as vp; confirm before reusing elsewhere.  */
static void
lowz (mp_ptr rp, mp_ptr up, mp_ptr vp, mp_size_t n)
{
  // Handle any number of low U zeros including that U = 0
  mp_size_t i;
  ASSERT (up[0] == 0);
  ASSERT (n > 1);
  /* i = number of low zero limbs of U.  If all n limbs are zero then
     U = 0 and the gcd is simply V.  */
  for (i = 1; up[i] == 0;)
    if (++i == n)
      {
	mpn_copyi (rp, vp, n);
	return;
      }
  /* Move the significant n-i limbs of U down to the bottom, shifting out
     the trailing zero bits of up[i] as well so U becomes odd.  */
  if ((up[i] & 1) == 0)
    {
      int cnt;
      count_trailing_zeros (cnt, up[i]);
      mpn_rshift (up, up + i, n - i, cnt);
    }
  else
    {
      mpn_copyi (up, up + i, n - i);
    }
  MPN_FILL (up + n - i, i, 0);	/* clear the i vacated top limbs */
  /* Reduce V against the (now odd) U until the top limb of V has gone to
     zero i times, i.e. until both operands fit in the shrunken size.  */
  while (1)
    {
      if (UNLIKELY (vp [n - 1] == 0))
	{
	  /* Top limb of V is zero: shrink the working size and emit a
	     zero result limb.  U's top limb at the new size is also zero
	     (its top i limbs were cleared above), so U still fits.  */
	  rp [--n] = 0;
	  if (--i == 0)
	    break;
	}
      else
	{
	  /* V -= U over the low n-1 limbs (U's top limb is zero here),
	     propagating the borrow into V's top limb.  */
	  vp [n - 1] -= mpn_sub_n (vp, vp, up, n - 1);
	  if (UNLIKELY (vp[0] == 0))
	    {
	      /* V gained a whole zero low limb: recurse with the roles
		 of U and V swapped -- a tail call in practice.  */
	      lowz (rp, vp, up, n);
	      return;
	    }
	  else
	    {
	      /* Shift out V's trailing zero bits so it is odd again.  */
	      int cnt;
	      count_trailing_zeros (cnt, vp[0]);
	      mpn_rshift (vp, vp, n, cnt);
	    }
	}
    }
  // We've reduced the operand by at least a full limb. Call the next smaller primitive.
  switch (n)
    {
    case 1:
      {
	rp[0] = mpn_gcd_11 (up[0], vp[0]);
	break;
      }
    case 2:
      {
	mp_double_limb_t g = mpn_gcd_22 (up[1], up[0], vp[1], vp[0]);
	rp[0] = g.d0;
	rp[1] = g.d1;
	break;
      }
    case 3:
      mpn_gcd_33 (rp, up, vp);
      break;
    default:
      ASSERT (n == 4);
      mpn_gcd_44 (rp, up, vp);
      break;
    }
}
/* Shared fixed-size binary-gcd kernel: rp = gcd(up, vp) for N limbs,
   N in {3, 4, 5}.  One classic binary GCD step per iteration: order the
   operands by top limb, subtract, take the absolute value on borrow,
   strip factors of two.  up and vp are reduced in place.  When both top
   limbs reach zero the work is handed to the next smaller fixed-size
   primitive.  N is a compile-time constant at each call site below, so
   the HAVE_NATIVE_* dispatch chains fold to a single call.  */
static inline void
mpn_gcd_NN (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp, size_t N)
{
  mp_limb_t cy;
  do
    {
      /* Make up the operand with the (weakly) larger top limb.  */
      CMP_SWAP (up, vp, N);
#if LATER
      /* Disabled experiment: add instead of subtract when that clears
	 more low bits, provided neither top bit is set (no overflow).
	 NOTE(review): 'vp[0] - up[0] & 3' parses as
	 '(vp[0] - up[0]) & 3' -- presumably intended, but worth
	 parenthesizing before enabling this path.  */
      if ((vp[0] - up[0] & 3) != 0
	  && LIKELY (((up[N - 1] | vp[N - 1]) & GMP_NUMB_HIGHBIT) == 0))
	{
	  cy = mpn_add_n (up, up, vp, N);
	}
      else
#endif
      /* u -= v, via a fixed-size assembly subtract when available.  */
#if HAVE_NATIVE_mpn_sub_3
      if (N == 3)
	cy = mpn_sub_3 (up, up, vp);
      else
#endif
#if HAVE_NATIVE_mpn_sub_4
      if (N == 4)
	cy = mpn_sub_4 (up, up, vp);
      else
#endif
#if HAVE_NATIVE_mpn_sub_5
      if (N == 5)
	cy = mpn_sub_5 (up, up, vp);
      else
#endif
      cy = mpn_sub_n (up, up, vp, N);
      /* CMP_SWAP only ordered the top limbs, so the subtraction can
	 still borrow; negate to get |u - v|.  */
      if (UNLIKELY (cy))
	mpn_neg (up, up, N);
      if (UNLIKELY (up[0] == 0))
	{
	  /* u lost a whole low limb of bits: switch to the slow path
	     that handles limb-sized shrinking.  */
	  lowz (rp, up, vp, N);
	  return;
	}
      else
	{
	  /* Strip trailing zero bits so u is odd again.  */
	  int cnt;
	  count_trailing_zeros (cnt, up[0]);
#if HAVE_NATIVE_mpn_rshift_3
	  if (N == 3)
	    mpn_rshift_3 (up, up, cnt);
	  else
#endif
#if HAVE_NATIVE_mpn_rshift_4
	  if (N == 4)
	    mpn_rshift_4 (up, up, cnt);
	  else
#endif
#if HAVE_NATIVE_mpn_rshift_5
	  if (N == 5)
	    mpn_rshift_5 (up, up, cnt);
	  else
#endif
	  mpn_rshift (up, up, N, cnt);
	}
    }
  while ((up[N - 1] | vp[N - 1]) != 0);
  // We've reduced the operand by a full limb. Call the next smaller primitive.
  switch (N)
    {
    case 3:
      {
	mp_double_limb_t g = mpn_gcd_22 (up[1], up[0], vp[1], vp[0]);
	rp[0] = g.d0;
	rp[1] = g.d1;
	rp[2] = 0;
	break;
      }
    case 4:
      rp[3] = 0;
      mpn_gcd_33 (rp, up, vp);
      break;
    case 5:
      rp[4] = 0;
      mpn_gcd_44 (rp, up, vp);
      break;
    }
}
void
SMASH(mpn_gcd_,33) (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
mpn_gcd_NN (rp, up, vp, 3);
}
void
SMASH(mpn_gcd_,44) (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
mpn_gcd_NN (rp, up, vp, 4);
}
void
SMASH(mpn_gcd_,55) (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
mpn_gcd_NN (rp, up, vp, 5);
}
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel