Hi,

On Wed, 28 August 2019, 11:39 am, Torbjörn Granlund wrote:
> Marco Bodrato <bodr...@mail.dm.unipi.it> writes:

>   or maybe simply a loop that knows that U will be the smaller, until V
>   too becomes one (or more) limbs shorter...
>
>   ...and finally tail-call to the smaller fixed-size asm function?
>
> I think this is a good proposal.

I implemented it; you can find it attached.

The funny thing is that, with the attached code, the whole chain gcd_55 ->
lowz -> gcd_44 -> lowz -> gcd_33 -> lowz is handled with tail calls,
roughly as in the sketch below.
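
A rough sketch of what I mean (placeholder names, not the attached
functions): when the final call is in tail position with unchanged
arguments, gcc and clang at -O2 typically emit a plain jmp instead of
call+ret, so the whole chain reuses a single stack frame.

void step_33 (mp_limb_t *, mp_limb_t *, mp_limb_t *);

void
step_44 (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
  /* ... reduce U and V by one limb ... */
  step_33 (rp, up, vp);	/* tail position: typically "jmp step_33" */
}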
On the other hand, a large part of the code is duplicated, yet it cannot
exploit the fixed-size shortcuts... I'm not so sure the resulting code
is useful.

Bye,
m

-- 
http://bodrato.it/papers/
#include "gmp-impl.h"
#include "longlong.h"

#if defined (__amd64__) && W_TYPE_SIZE == 64
/* Branch-free compare-and-swap: leave in ap the operand with the
   larger (or equal) most significant limb, in bp the other one.  */
#define CMP_SWAP(ap,bp,n)						\
  do {									\
    mp_limb_t tmp;							\
    __asm__ ("cmp\t%5, %6"	"\n\t"					\
	     "mov\t%1, %0"	"\n\t"					\
	     "cmovae\t%2, %1"	"\n\t"					\
	     "cmovae\t%0, %2"						\
	     : "=&r,&r" (tmp), "=r,r" (ap), "=r,r" (bp)			\
	     : "1,1" (ap), "2,2" (bp), "r,rm" (ap[n-1]), "rm,r" (bp[n-1])); \
  } while (0)
/* Broken variant, kept only for reference: the outputs %1 and %2 are
   not tied to the inputs, so they are left undefined whenever the
   cmovs do not execute.  */
#define badCMP_SWAP(ap,bp,n)						\
  do {									\
    mp_limb_t tmp;							\
    __asm__ ("cmp\t%5, %6"	"\n\t"					\
	     "mov\t%3, %0"	"\n\t"					\
	     "cmovae\t%4, %1"	"\n\t"					\
	     "cmovae\t%0, %2"						\
	     : "=&r,&r" (tmp), "=r,r" (ap), "=r,r" (bp)			\
	     : "r,r" (ap), "r,r" (bp), "r,rm" (ap[n-1]), "rm,r" (bp[n-1])); \
  } while (0)
#endif

#ifndef CMP_SWAP
/* Generic branch-free fallback: select the pointers with a mask.  */
#define CMP_SWAP(ap,bp,n)						\
  do {									\
    mp_limb_t m = -(mp_limb_t) (ap[n - 1] < bp[n - 1]);			\
    mp_limb_t *_tp = ap;						\
    ap = (mp_ptr) (((size_t) ap & ~m) | ((size_t) bp & m));		\
    bp = (mp_ptr) (((size_t) bp & ~m) | ((size_t) _tp & m));		\
  } while (0)
#endif

#ifndef CMP_SWAP
/* Plain branching fallback; unused as long as the mask version above
   is unconditionally defined.  */
#define CMP_SWAP(ap,bp,n)						\
  do {									\
    if (ap[n - 1] < bp[n - 1])						\
      {									\
	mp_limb_t *_tp = ap; ap = bp; bp = _tp;				\
      }									\
  } while (0)
#endif
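
/* Sanity-check sketch for the macro's contract (hypothetical, not used
   by the rest of this file): after CMP_SWAP, ap must point at the
   operand whose most significant limb is the larger (or equal) one;
   only the pointers are exchanged, the limbs themselves are untouched.  */
static void
check_cmp_swap (void)
{
  mp_limb_t a[2] = { 7, 1 }, b[2] = { 5, 2 };
  mp_ptr ap = a, bp = b;
  CMP_SWAP (ap, bp, 2);		/* a[1] < b[1], so the pointers swap */
  ASSERT_ALWAYS (ap == b && bp == a);
  ASSERT_ALWAYS (ap[1] >= bp[1]);
}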


/* Token-pasting helper, used below to instantiate the entry points.  */
#define SMASH(a,b) a##b

void mpn_gcd_33 (mp_limb_t *, mp_limb_t *, mp_limb_t *);
void mpn_gcd_44 (mp_limb_t *, mp_limb_t *, mp_limb_t *);
void mpn_gcd_55 (mp_limb_t *, mp_limb_t *, mp_limb_t *);

mp_limb_t mpn_sub_3 (mp_ptr, mp_srcptr, mp_srcptr);
mp_limb_t mpn_sub_4 (mp_ptr, mp_srcptr, mp_srcptr);
mp_limb_t mpn_sub_5 (mp_ptr, mp_srcptr, mp_srcptr);
void mpn_rshift_3 (mp_ptr, mp_srcptr, int);
void mpn_rshift_4 (mp_ptr, mp_srcptr, int);
void mpn_rshift_5 (mp_ptr, mp_srcptr, int);

static void
lowz (mp_ptr rp, mp_ptr up, mp_ptr vp, mp_size_t n)
{
  // Handle any number of low zero limbs in U, including the case U = 0.
  mp_size_t i;
  ASSERT (up[0] == 0);
  ASSERT (n > 1);
  for (i = 1; up[i] == 0;)
    if (++i == n)
      {
	mpn_copyi (rp, vp, n);
	return;
      }

  if ((up[i] & 1) == 0)
    {
      int cnt;
      count_trailing_zeros (cnt, up[i]);
      mpn_rshift (up, up + i, n - i, cnt);
    }
  else
    {
      mpn_copyi (up, up + i, n - i);
    }
  MPN_FILL (up + n - i, i, 0);

  while (1)
    {
      if (UNLIKELY (vp[n - 1] == 0))
	{
	  rp[--n] = 0;
	  if (--i == 0)
	    break;
	}
      else
	{
	  vp[n - 1] -= mpn_sub_n (vp, vp, up, n - 1);

	  if (UNLIKELY (vp[0] == 0))
	    {
	      lowz (rp, vp, up, n);
	      return;
	    }
	  else
	    {
	      int cnt;
	      count_trailing_zeros (cnt, vp[0]);
	      mpn_rshift (vp, vp, n, cnt);
	    }
	}
    }

  // We've reduced the operand by at least a full limb. Call the next smaller primitive.
  switch (n)
    {
    case 1:
      {
	rp[0] = mpn_gcd_11 (up[0], vp[0]);
	break;
      }
    case 2:
      {
	mp_double_limb_t g = mpn_gcd_22 (up[1], up[0], vp[1], vp[0]);
	rp[0] = g.d0;
	rp[1] = g.d1;
	break;
      }
    case 3:
      mpn_gcd_33 (rp, up, vp);
      break;
    default:
      ASSERT (n == 4);
      mpn_gcd_44 (rp, up, vp);
      break;
    }
}
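
/* Hypothetical check of lowz on a concrete case (assuming 64-bit limbs,
   not part of the patch): U = 2^64, V = 3 with n = 3, so the single low
   zero limb of U is stripped, the top-limb loop shrinks n to 2, and the
   mpn_gcd_22 branch finishes with gcd (2^64, 3) = 1.  */
static void
check_lowz (void)
{
  mp_limb_t u[3] = { 0, 1, 0 };	/* U = 2^64 */
  mp_limb_t v[3] = { 3, 0, 0 };	/* V = 3 */
  mp_limb_t r[3];

  lowz (r, u, v, 3);
  ASSERT_ALWAYS (r[0] == 1 && r[1] == 0 && r[2] == 0);
}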

static inline void
mpn_gcd_NN (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp, size_t N)
{
  mp_limb_t cy;

  do
    {
      CMP_SWAP (up, vp, N);

#if LATER
      if (((vp[0] - up[0]) & 3) != 0
	  && LIKELY (((up[N - 1] | vp[N - 1]) & GMP_NUMB_HIGHBIT) == 0))
	{
	  cy = mpn_add_n (up, up, vp, N);
	}
      else
#endif
#if HAVE_NATIVE_mpn_sub_3
      if (N == 3)
	cy = mpn_sub_3 (up, up, vp);
      else
#endif
#if HAVE_NATIVE_mpn_sub_4
      if (N == 4)
	cy = mpn_sub_4 (up, up, vp);
      else
#endif
#if HAVE_NATIVE_mpn_sub_5
      if (N == 5)
	cy = mpn_sub_5 (up, up, vp);
      else
#endif
	cy = mpn_sub_n (up, up, vp, N);

      if (UNLIKELY (cy))
	mpn_neg (up, up, N);

      if (UNLIKELY (up[0] == 0))
	{
	  lowz (rp, up, vp, N);
	  return;
	}
      else
	{
	  int cnt;
	  count_trailing_zeros (cnt, up[0]);
#if HAVE_NATIVE_mpn_rshift_3
	  if (N == 3)
	    mpn_rshift_3 (up, up, cnt);
	  else
#endif
#if HAVE_NATIVE_mpn_rshift_4
	  if (N == 4)
	    mpn_rshift_4 (up, up, cnt);
	  else
#endif
#if HAVE_NATIVE_mpn_rshift_5
	  if (N == 5)
	    mpn_rshift_5 (up, up, cnt);
	  else
#endif
	    mpn_rshift (up, up, N, cnt);
	}
    }
  while ((up[N - 1] | vp[N - 1]) != 0);

  // We've reduced the operand by a full limb. Call the next smaller primitive.
  switch (N)
    {
    case 3:
      {
	mp_double_limb_t g = mpn_gcd_22 (up[1], up[0], vp[1], vp[0]);
	rp[0] = g.d0;
	rp[1] = g.d1;
	rp[2] = 0;
	break;
      }
    case 4:
      rp[3] = 0;
      mpn_gcd_33 (rp, up, vp);
      break;
    case 5:
      rp[4] = 0;
      mpn_gcd_44 (rp, up, vp);
      break;
    }
}
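
/* Single-limb reference for the loop above (sketch, not used here):
   the same cmp/swap, subtract, shift-out-zeros step, essentially what
   mpn_gcd_11 implements.  Both operands must be odd.  */
static mp_limb_t
gcd_11_ref (mp_limb_t u, mp_limb_t v)
{
  int cnt;

  ASSERT ((u & 1) != 0 && (v & 1) != 0);
  while (u != v)
    {
      if (u < v)		/* the CMP_SWAP step */
	{
	  mp_limb_t t = u; u = v; v = t;
	}
      u -= v;			/* odd - odd is even and nonzero */
      count_trailing_zeros (cnt, u);
      u >>= cnt;		/* make u odd again */
    }
  return u;
}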

void
SMASH(mpn_gcd_,33) (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
  mpn_gcd_NN (rp, up, vp, 3);
}

void
SMASH(mpn_gcd_,44) (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
  mpn_gcd_NN (rp, up, vp, 4);
}

void
SMASH(mpn_gcd_,55) (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
  mpn_gcd_NN (rp, up, vp, 5);
}
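
/* Example use (hypothetical driver, assuming 64-bit limbs): operands
   are full 3-limb values, presumably both odd, as mpn_gcd_11 and
   mpn_gcd_22 require; the u and v areas are clobbered.  */
#ifdef TEST_DRIVER
int
main (void)
{
  mp_limb_t u[3] = { 15, 0, 1 };	/* 2^128 + 15 */
  mp_limb_t v[3] = { 9, 0, 1 };		/* 2^128 + 9 */
  mp_limb_t r[3];

  mpn_gcd_33 (r, u, v);
  return 0;
}
#endif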